From d695a2eadde2d0799b5c663690212ec022d61c1c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Sat, 30 May 2020 20:11:37 -0300
Subject: Speed up loading large databases on startup

Batching the sha1sum work is dramatically faster: hashing files 500 at
a time through a single sha1sum subprocess replaces hashing each file
individually.

Signed-off-by: Jason Gunthorpe
---
 cloud_mdir_sync/messages.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/cloud_mdir_sync/messages.py b/cloud_mdir_sync/messages.py
index 9490d6e..d7044ae 100644
--- a/cloud_mdir_sync/messages.py
+++ b/cloud_mdir_sync/messages.py
@@ -224,20 +224,27 @@ class MessageDB(object):
         stuff we have already downloaded and is crash safe as we rehash
         every file. Accidental duplicates are pruned along the way."""
         hashes = set()
-        for fn in os.listdir(hashes_dir):
-            if fn.startswith("."):
-                continue
-
-            # Since we don't use sync the files can be corrupted, check them.
-            ffn = os.path.join(hashes_dir, fn)
-            ch = self._sha1_fn(ffn)
-            if fn == ch:
-                hashes.add(ch)
-                st = os.stat(ffn)
-                inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
-                self.inode_hashes[inode] = ch
-            else:
-                os.unlink(ffn)
+        # Since we don't use sync the files can be corrupted, check them.
+        dirl = [fn for fn in os.listdir(hashes_dir) if not fn.startswith(".")]
+        while dirl:
+            chunk = dirl[:500]
+            del dirl[:500]
+            sha1 = subprocess.check_output(["sha1sum"] + chunk,
+                                           cwd=hashes_dir).decode()
+            lines = sha1.splitlines()
+            assert len(chunk) == len(lines)
+            for fn, ln in zip(chunk, lines):
+                ch, _, check_fn = ln.partition('  ')
+                assert check_fn == fn
+
+                ffn = os.path.join(hashes_dir, fn)
+                if fn == ch:
+                    hashes.add(ch)
+                    st = os.stat(ffn)
+                    inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
+                    self.inode_hashes[inode] = ch
+                else:
+                    os.unlink(ffn)
         self.file_hashes.update(hashes)
 
     def have_content(self, msg: Message):
--
cgit v1.2.3
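
For illustration, the batched verification pattern the patch introduces
can be written as a standalone helper. This is a minimal sketch, not
part of the patch: the name verify_hash_dir and the fixed 500-file
chunk size are illustrative, and it assumes GNU coreutils sha1sum is on
PATH (it prints one "<sha1>  <name>" line per argument, separated by
two spaces, in argument order).

    import os
    import subprocess

    CHUNK = 500  # files per sha1sum invocation, mirroring the patch


    def verify_hash_dir(hashes_dir: str) -> set:
        """Return the file names in hashes_dir whose SHA-1 matches
        their name, unlinking corrupted entries along the way.
        Illustrative stand-in for the MessageDB loop above."""
        good = set()
        names = [fn for fn in os.listdir(hashes_dir)
                 if not fn.startswith(".")]
        for i in range(0, len(names), CHUNK):
            chunk = names[i:i + CHUNK]
            # One subprocess hashes the whole chunk instead of paying
            # per-file hashing overhead for every entry.
            out = subprocess.check_output(["sha1sum"] + chunk,
                                          cwd=hashes_dir).decode()
            for fn, ln in zip(chunk, out.splitlines()):
                ch, _, check_fn = ln.partition('  ')
                assert check_fn == fn
                if ch == fn:
                    good.add(fn)
                else:
                    os.unlink(os.path.join(hashes_dir, fn))
        return good

Chunking keeps each argv comfortably below the kernel's ARG_MAX limit
(500 forty-character hex names is about 20 KB) while still amortizing
process startup across hundreds of files per invocation.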