aboutsummaryrefslogtreecommitdiffstats
path: root/scripts/dl_github_archive.py
diff options
context:
space:
mode:
authorYousong Zhou <yszhou4tech@gmail.com>2018-06-28 18:27:27 +0800
committerYousong Zhou <yszhou4tech@gmail.com>2018-07-05 01:30:57 +0800
commit04b9f8587370f96366c6e53fb411473279ba7c02 (patch)
treee94957b40825486a70c170db1fbae225cff9307b /scripts/dl_github_archive.py
parente48ea13b3bac5393d6400156ddb066ec5de2ea4e (diff)
downloadupstream-04b9f8587370f96366c6e53fb411473279ba7c02.tar.gz
upstream-04b9f8587370f96366c6e53fb411473279ba7c02.tar.bz2
upstream-04b9f8587370f96366c6e53fb411473279ba7c02.zip
scripts/dl_github_archive.py: rename from download.py
- Make the code more GitHub-specific - Requires mirror hash to work with .gitattributes - Use different API depending on whether PKG_SOURCE_VERSION is a complete commit id or other ref types like tags - Fix removing symbolic link - pre-clean dir_untar for possible leftovers from previous run Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
Diffstat (limited to 'scripts/dl_github_archive.py')
-rwxr-xr-xscripts/dl_github_archive.py426
1 files changed, 426 insertions, 0 deletions
diff --git a/scripts/dl_github_archive.py b/scripts/dl_github_archive.py
new file mode 100755
index 0000000000..5a5a016e37
--- /dev/null
+++ b/scripts/dl_github_archive.py
@@ -0,0 +1,426 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
+#
+# This is free software, licensed under the GNU General Public License v2.
+# See /LICENSE for more information.
+
+import argparse
+import calendar
+import datetime
+import errno
+import fcntl
+import hashlib
+import json
+import os
+import os.path
+import re
+import shutil
+import ssl
+import subprocess
+import sys
+import time
+import urllib2
+
# Scratch root: honour $TMP_DIR when it is set and non-empty, else use /tmp.
TMPDIR = os.getenv('TMP_DIR') or '/tmp'
# Intermediate download artefacts (tarballs, unpack dirs, cache file) live here.
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
+
+
class PathException(Exception):
    """Error raised by the ``Path`` helpers (e.g. unexpected tarball layout)."""
class DownloadGitHubError(Exception):
    """Error raised when the GitHub archive download method cannot proceed."""
+
+
class Path(object):
    """Context class for preparing and cleaning up a filesystem path.

    If ``preclean`` is ``False``, ``path`` will NOT be removed on context enter.

    If ``isdir`` is ``True``, ``path`` will be created as a directory
    (including missing parents) on context enter.

    If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
    """

    def __init__(self, path, isdir=True, preclean=False, keep=False):
        self.path = path
        self.isdir = isdir
        self.preclean = preclean
        self.keep = keep

    def __enter__(self):
        if self.preclean:
            self.rm_all(self.path)
        if self.isdir:
            self.mkdir_all(self.path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Cleanup runs whether or not the body raised; the exception (if any)
        # is not suppressed because nothing truthy is returned.
        if not self.keep:
            self.rm_all(self.path)

    @staticmethod
    def mkdir_all(path):
        """Same as mkdir -p.

        Creates ``path`` and every missing parent directory.  An already
        existing ``path`` is not an error.
        """
        # os.path.split() only yields (head, tail), so walking its result
        # cannot create more than one missing level; os.makedirs() can.
        Path._os_func(os.makedirs, path, errno.EEXIST)

    @staticmethod
    def _rmdir_dir(dir_):
        """Remove directory ``dir_`` after recursively removing its entries."""
        for name in Path._listdir(dir_):
            Path.rm_all(os.path.join(dir_, name))
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        """Create a single directory; an existing ``path`` is ignored."""
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        """Remove an empty directory; a missing ``path`` is ignored."""
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        """Remove a file or symbolic link; a missing ``path`` is ignored."""
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        """List directory entries; a missing ``path`` yields ``[]``."""
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, errno, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
        return ``default``, otherwise, re-raise.

        Note that the ``errno`` parameter shadows the ``errno`` module inside
        this function; it is the numeric error code to tolerate.
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno != errno:
                raise
            return default

    @staticmethod
    def rm_all(path):
        """Same as rm -r.

        A symbolic link is removed itself; the directory it may point to is
        never descended into.
        """
        if os.path.isdir(path) and not os.path.islink(path):
            Path._rmdir_dir(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
        """Extract gzip'ed tarball at ``path`` into directory ``into``.

        Return the name of the single top-level entry found in ``into`` after
        extraction; raise PathException if there is not exactly one entry.
        Raises subprocess.CalledProcessError when ``tar`` fails.
        """
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        # The child gets umask 022 before exec; together with
        # --no-same-permissions this keeps extracted modes independent of the
        # caller's umask.
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) != 1:
            raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))
        return dirs[0]

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``subdir`` found under ``path`` into tarball ``into``.

        The compression is chosen from the suffix of ``into`` (.xz, .bz2 or
        .gz); any other suffix raises PathException.  When ``ts`` is given it
        is used as the mtime (seconds since the epoch) of every member.
        Raises subprocess.CalledProcessError when ``tar`` fails.
        """
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            # -n: do not store name/timestamp in the gzip header
            envs['GZIP'] = '-n'
        else:
            raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)
+
+
class GitHubCommitTsCache(object):
    """Small file-backed cache mapping a key to a commit timestamp.

    The cache file holds one ``<key> <ts> <updated>`` record per line, where
    ``updated`` is the time the record was last written.  Access to the file
    is serialized with ``fcntl.lockf`` (shared for reads, exclusive for
    writes).  At most ``__cachen`` most recently updated records are kept.
    """

    __cachef = 'github.commit.ts.cache'
    __cachen = 2048

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``; return ``None`` when not cached."""
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
        with os.fdopen(fileno) as fin:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                ent = self.cache.get(k)
                if ent is not None:
                    return ent[0]
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp with ``k``."""
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
        # 'r+': read and rewrite the existing text file.  Wrapping an already
        # open descriptor never truncates it; _cache_flush() does that
        # explicitly while the lock is held.
        with os.fdopen(fileno, 'r+') as f:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        """Merge records read from ``fin`` into ``self.cache``.

        Lines that do not parse as ``<key> <int> <int>`` are skipped so that a
        damaged cache file degrades to cache misses instead of errors; such
        lines disappear on the next ``set``.
        """
        for line in fin:
            fields = line.split()
            if len(fields) != 3:
                continue
            k, ts, updated = fields
            try:
                self.cache[k] = (int(ts), int(updated))
            except ValueError:
                continue

    def _cache_flush(self, fout):
        """Rewrite the cache file from ``self.cache`` and reset the latter."""
        # most recently updated first, then drop whatever exceeds the limit
        cache = sorted(self.cache.items(), key=lambda kv: kv[1][1], reverse=True)
        cache = cache[:self.__cachen]
        self.cache = {}
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts, updated = ent
            fout.write('{0} {1} {2}\n'.format(k, ts, updated))
        # The caller drops the lock before the file object is closed, so the
        # buffered data has to reach the file now, while still locked.
        fout.flush()
+
+
class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with fragile internet connection.

    However, there are limitations with this method

     - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
       This affects fetching commit date for reproducible tarballs.  Download
       through the archive link is not affected.

     - GitHub archives do not contain source codes for submodules.

     - GitHub archives seem to respect .gitattributes and ignore paths with
       export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback to
    clone-then-pack method.

    As for the 3rd issue, to make sure that this method only produces identical
    tarballs as the fallback method, we require the expected hash value to be
    supplied.  That means the first tarball will need to be prepared by the
    clone-then-pack method
    """

    # The dot is escaped so that only the literal host "github.com" matches.
    __repo_url_regex = re.compile(r'^(?:https|git)://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        """Initialize from parsed command line ``args``.

        Reads ``dl_dir``, ``version``, ``subdir``, ``source``, ``url`` and
        ``hash`` from ``args``.  Raises DownloadGitHubError when the URL is
        not a GitHub repo URL or the hash has an unsupported length.
        """
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None   # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball."""
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        # do not leave a tarball with unexpected content behind
                        Path.rm_all(into)
                        raise
        # move to target location
        file1 = os.path.join(self.dl_dir, self.source)
        if into != file1:
            shutil.move(into, file1)

    def _has_submodule(self, dir_):
        """Tell whether the tree at ``dir_`` has a non-empty .gitmodules.

        Any stat error other than "no such file" is conservatively treated as
        "has submodules".
        """
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        """Parse ``self.url`` into ``self.owner`` and ``self.repo``.

        A trailing ``.git`` is stripped from the repo name.  Raises
        DownloadGitHubError when the URL is not a github.com repo URL.
        """
        m = self.__repo_url_regex.search(self.url)
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        """Pick the hash algorithm from the length of the expected digest.

        64 hex digits select sha256, 32 select md5; anything else, including
        a missing hash, raises DownloadGitHubError.
        """
        xhash = self.xhash or ''
        if len(xhash) == 64:
            self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum for verification')

    def _hash_check(self, f):
        """Raise DownloadGitHubError unless file ``f`` has the expected hash."""
        # Work on a copy so that the pristine hasher can be reused and a
        # repeated check does not digest the concatenation of both files.
        hasher = self.hasher.copy()
        with open(f, 'rb') as fin:
            for chunk in iter(lambda: fin.read(4096), b''):
                hasher.update(chunk)
        xhash = hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        """Load the commit timestamp of ``self.version`` into ``self.commit_ts``.

        The local cache is consulted first, then the GitHub API.  Raises
        DownloadGitHubError when no API yields a timestamp.
        """
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data.  API[1] is more
        # terse while API[2] provides more verbose info such as commit diff
        # etc.  That's the main reason why API[1] is preferred: the response
        # size is predictable.
        #
        # However, API[1] only accepts complete commit sha1sum as the parameter
        # while API[2] is more liberal accepting also partial commit id and
        # tags, etc.
        #
        # [1] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        # [2] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            # try the liberal API[2] first for tags, partial ids, etc.
            apis.insert(0, apis.pop())
        last_err = None
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                # Fall through to the next API, but remember why this one
                # failed so that the final error is actionable (rate limit,
                # unknown ref, network failure, ...).
                last_err = e
        raise self._error('Cannot fetch commit ts: {}: {}'.format(url, last_err))

    def _init_commit_ts_remote_get(self, url, attrpath):
        """Fetch ``url`` and return the commit date as seconds since the epoch.

        ``attrpath`` is the sequence of keys leading to the date string in
        the JSON response; the date is parsed as ``%Y-%m-%dT%H:%M:%SZ`` and
        interpreted as UTC.
        """
        resp = self._make_request(url)
        try:
            data = resp.read()
        finally:
            resp.close()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        return calendar.timegm(date.timetuple())

    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        try:
            with open(path, 'wb') as fout:
                shutil.copyfileobj(resp, fout, 4096)
        finally:
            resp.close()

    def _make_repo_url_path(self, *args):
        """Return the API path ``/repos/<owner>/<repo>[/<args joined by />]``."""
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib2.Request(url, headers=headers)
        # NOTE(review): TLS certificate verification is disabled here.  The
        # repacked tarball is checked against the expected hash, but the
        # commit timestamp response is not authenticated -- confirm this is
        # intended before relying on it.
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib2.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        """Build (not raise) a DownloadGitHubError prefixed with the source name."""
        return DownloadGitHubError('{}: {}'.format(self.source, msg))
+
+
def main():
    """Parse the command line and run the GitHub archive download method.

    On any failure a one-line summary is written to stderr and the exception
    is re-raised, so the process exits non-zero and the caller can fall back
    to another download method.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
    # The options below are all dereferenced unconditionally later on, so
    # let argparse reject a missing one with a usage message instead of
    # failing with a TypeError further down.
    parser.add_argument('--url', required=True, help='Download URL')
    parser.add_argument('--subdir', required=True, help='Source code subdir name')
    parser.add_argument('--version', required=True, help='Source code version')
    parser.add_argument('--source', required=True, help='Source tarball filename')
    parser.add_argument('--hash', required=True, help='Source tarball\'s expected sha256sum')
    args = parser.parse_args()
    try:
        # Construction validates the URL and the hash, so it belongs inside
        # the guarded region as well.
        method = DownloadGitHubTarball(args)
        method.download()
    except Exception:
        sys.stderr.write('download {} from {} failed\n'.format(args.source, args.url))
        raise


if __name__ == '__main__':
    main()