diff options
author | Jason Gunthorpe <jgg@mellanox.com> | 2020-05-30 20:11:37 -0300 |
---|---|---|
committer | Jason Gunthorpe <jgg@mellanox.com> | 2020-05-30 20:11:37 -0300 |
commit | d695a2eadde2d0799b5c663690212ec022d61c1c (patch) | |
tree | bc1ab4055962b15f167a00472e79344f24849f0c | |
parent | 9d4aa4c3264127fffbbcab99dfa303b2214749d9 (diff) | |
download | cloud_mdir_sync-d695a2eadde2d0799b5c663690212ec022d61c1c.tar.gz cloud_mdir_sync-d695a2eadde2d0799b5c663690212ec022d61c1c.tar.bz2 cloud_mdir_sync-d695a2eadde2d0799b5c663690212ec022d61c1c.zip |
Speed up loading large databases on startup
Batching the sha1sum work is dramatically faster.
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
-rw-r--r-- | cloud_mdir_sync/messages.py | 35 |
1 files changed, 21 insertions, 14 deletions
```diff
diff --git a/cloud_mdir_sync/messages.py b/cloud_mdir_sync/messages.py
index 9490d6e..d7044ae 100644
--- a/cloud_mdir_sync/messages.py
+++ b/cloud_mdir_sync/messages.py
@@ -224,20 +224,27 @@ class MessageDB(object):
         stuff we have already downloaded and is crash safe as we rehash every
         file. Accidental duplicates are pruned along the way."""
         hashes = set()
-        for fn in os.listdir(hashes_dir):
-            if fn.startswith("."):
-                continue
-
-            # Since we don't use sync the files can be corrupted, check them.
-            ffn = os.path.join(hashes_dir, fn)
-            ch = self._sha1_fn(ffn)
-            if fn == ch:
-                hashes.add(ch)
-                st = os.stat(ffn)
-                inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
-                self.inode_hashes[inode] = ch
-            else:
-                os.unlink(ffn)
+        # Since we don't use sync the files can be corrupted, check them.
+        dirl = [fn for fn in os.listdir(hashes_dir) if not fn.startswith(".")]
+        while dirl:
+            chunk = dirl[:500]
+            del dirl[:500]
+            sha1 = subprocess.check_output(["sha1sum"] + chunk,
+                                           cwd=hashes_dir).decode()
+            lines = sha1.splitlines()
+            assert(len(chunk) == len(lines))
+            for fn,ln in zip(chunk,lines):
+                ch, _, check_fn = ln.partition('  ')
+                assert(check_fn == fn)
+
+                ffn = os.path.join(hashes_dir, fn)
+                if fn == ch:
+                    hashes.add(ch)
+                    st = os.stat(ffn)
+                    inode = (st.st_ino, st.st_size, st.st_mtime, st.st_ctime)
+                    self.inode_hashes[inode] = ch
+                else:
+                    os.unlink(ffn)
         self.file_hashes.update(hashes)
 
     def have_content(self, msg: Message):
```