about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jason Gunthorpe <jgg@mellanox.com> 2020-05-30 20:11:37 -0300
committer: Jason Gunthorpe <jgg@mellanox.com> 2020-05-30 20:11:37 -0300
commit: d695a2eadde2d0799b5c663690212ec022d61c1c (patch)
tree: bc1ab4055962b15f167a00472e79344f24849f0c
parent: 9d4aa4c3264127fffbbcab99dfa303b2214749d9 (diff)
download: cloud_mdir_sync-d695a2eadde2d0799b5c663690212ec022d61c1c.tar.gz
cloud_mdir_sync-d695a2eadde2d0799b5c663690212ec022d61c1c.tar.bz2
cloud_mdir_sync-d695a2eadde2d0799b5c663690212ec022d61c1c.zip
Speed up loading large databases on startup

Batching the sha1sum work is dramatically faster.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
-rw-r--r-- cloud_mdir_sync/messages.py 35
1 file changed, 21 insertions, 14 deletions
diff --git a/cloud_mdir_sync/messages.py b/cloud_mdir_sync/messages.py
index 9490d6e..d7044ae 100644
--- a/cloud_mdir_sync/messages.py
+++ b/cloud_mdir_sync/messages.py
@@ -224,20 +224,27 @@ class MessageDB(object):
stuff we have already downloaded and is crash safe as we rehash every
file. Accidental duplicates are pruned along the way."""
hashes = set()
- for fn in os.listdir(hashes_dir):
- if fn.startswith("."):
- continue
-
- # Since we don't use sync the files can be corrupted, check them.
- ffn = os.path.join(hashes_dir, fn)
- ch = self._sha1_fn(ffn)
- if fn == ch:
- hashes.add(ch)
- st = os.stat(ffn)
- inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
- self.inode_hashes[inode] = ch
- else:
- os.unlink(ffn)
+ # Since we don't use sync the files can be corrupted, check them.
+ dirl = [fn for fn in os.listdir(hashes_dir) if not fn.startswith(".")]
+ while dirl:
+ chunk = dirl[:500]
+ del dirl[:500]
+ sha1 = subprocess.check_output(["sha1sum"] + chunk,
+ cwd=hashes_dir).decode()
+ lines = sha1.splitlines()
+ assert(len(chunk) == len(lines))
+ for fn,ln in zip(chunk,lines):
+ ch, _, check_fn = ln.partition(' ')
+ assert(check_fn == fn)
+
+ ffn = os.path.join(hashes_dir, fn)
+ if fn == ch:
+ hashes.add(ch)
+ st = os.stat(ffn)
+ inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
+ self.inode_hashes[inode] = ch
+ else:
+ os.unlink(ffn)
self.file_hashes.update(hashes)
def have_content(self, msg: Message):