From d695a2eadde2d0799b5c663690212ec022d61c1c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Sat, 30 May 2020 20:11:37 -0300
Subject: Speed up loading large databases on startup

Batching the sha1sum work is dramatically faster: hashing files 500 at
a time through a single sha1sum subprocess replaces hashing each file
individually.

Signed-off-by: Jason Gunthorpe
---
 cloud_mdir_sync/messages.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/cloud_mdir_sync/messages.py b/cloud_mdir_sync/messages.py
index 9490d6e..d7044ae 100644
--- a/cloud_mdir_sync/messages.py
+++ b/cloud_mdir_sync/messages.py
@@ -224,20 +224,27 @@ class MessageDB(object):
         stuff we have already downloaded and is crash safe as we rehash
         every file. Accidental duplicates are pruned along the way."""
         hashes = set()
-        for fn in os.listdir(hashes_dir):
-            if fn.startswith("."):
-                continue
-
-            # Since we don't use sync the files can be corrupted, check them.
-            ffn = os.path.join(hashes_dir, fn)
-            ch = self._sha1_fn(ffn)
-            if fn == ch:
-                hashes.add(ch)
-                st = os.stat(ffn)
-                inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
-                self.inode_hashes[inode] = ch
-            else:
-                os.unlink(ffn)
+        # Since we don't use sync the files can be corrupted, check them.
+        dirl = [fn for fn in os.listdir(hashes_dir) if not fn.startswith(".")]
+        while dirl:
+            chunk = dirl[:500]
+            del dirl[:500]
+            sha1 = subprocess.check_output(["sha1sum"] + chunk,
+                                           cwd=hashes_dir).decode()
+            lines = sha1.splitlines()
+            assert len(chunk) == len(lines)
+            for fn, ln in zip(chunk, lines):
+                ch, _, check_fn = ln.partition('  ')
+                assert check_fn == fn
+
+                ffn = os.path.join(hashes_dir, fn)
+                if fn == ch:
+                    hashes.add(ch)
+                    st = os.stat(ffn)
+                    inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
+                    self.inode_hashes[inode] = ch
+                else:
+                    os.unlink(ffn)
         self.file_hashes.update(hashes)
 
     def have_content(self, msg: Message):
--
cgit v1.2.3
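
For illustration, the batched verification pattern the patch introduces
can be written as a standalone helper. This is a minimal sketch, not
part of the patch: the name verify_hash_dir and the fixed 500-file
chunk size are illustrative, and it assumes GNU coreutils sha1sum is on
PATH (it prints one "<sha1>  <name>" line per argument, separated by
two spaces, in argument order).

    import os
    import subprocess

    CHUNK = 500  # files per sha1sum invocation, mirroring the patch


    def verify_hash_dir(hashes_dir: str) -> set:
        """Return the file names in hashes_dir whose SHA-1 matches
        their name, unlinking corrupted entries along the way.
        Illustrative stand-in for the MessageDB loop above."""
        good = set()
        names = [fn for fn in os.listdir(hashes_dir)
                 if not fn.startswith(".")]
        for i in range(0, len(names), CHUNK):
            chunk = names[i:i + CHUNK]
            # One subprocess hashes the whole chunk instead of paying
            # per-file hashing overhead for every entry.
            out = subprocess.check_output(["sha1sum"] + chunk,
                                          cwd=hashes_dir).decode()
            for fn, ln in zip(chunk, out.splitlines()):
                ch, _, check_fn = ln.partition('  ')
                assert check_fn == fn
                if ch == fn:
                    good.add(fn)
                else:
                    os.unlink(os.path.join(hashes_dir, fn))
        return good

Chunking keeps each argv comfortably below the kernel's ARG_MAX limit
(500 forty-character hex names is about 20 KB) while still amortizing
process startup across hundreds of files per invocation.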