From 69468cbd5b7ec2bd59faab70d10e1bbe619f5a98 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 21 Oct 2018 10:45:08 +0100
Subject: [PATCH] Refactor dependency builder scope checks

Make them common to all commands, and clarify the docs about scoping
in general.
---
 README.md                       | 26 ++++++++++++++++++-
 gitlab_docker_autodep/deps.py   | 44 ++++++++++++++++++++++++++-------
 gitlab_docker_autodep/main.py   | 36 ++++++++++++++++++---------
 gitlab_docker_autodep/server.py |  9 ++++---
 4 files changed, 90 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 64db078..533f6aa 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ command-line option,
 
 You can pass an authentication token using the *--token* command-line
 option. This is usually required in order to trigger CI pipelines: the
-token must have sufficient permissions to do so.
+access token must have the *api* scope.
 
 The tool will only examine Docker images hosted on the Docker registry
 associated with the Gitlab instance. By default the registry name is
@@ -54,6 +54,30 @@ automatically derived from the server URL (adding a *registry*
 prefix), but it can be changed with the *--registry* command-line
 option.
 
+### Scope
+
+On larger Gitlab instances, parsing Dockerfiles for all projects can
+be an expensive (long) operation. The program offers two options to
+manage the scope of the dependency analysis: *--match* and *--filter*.
+
+The former, *--match*, filters the project list on the server side,
+using a Gitlab search query. The latter, *--filter*, applies a
+regular expression to the project names (including namespaces) on the
+client side, before parsing their dependencies. For example, by
+combining the two it is possible to efficiently limit the scope of
+the tool to a specific namespace:
+
+```
+gitlab-docker-autodep ... --match myns --filter '^myns/' ...
+```
+
+Note that, when building the dependency tree:
+
+* tags in FROM lines are ignored
+* only the *master* branch of repositories is scanned for Dockerfiles
+
+This might lead to more rebuilds than strictly necessary.
+
 ## Command-line
 
 The `rebuild` command will trigger a rebuild of all the dependencies
diff --git a/gitlab_docker_autodep/deps.py b/gitlab_docker_autodep/deps.py
index 81fc7ac..12f0f25 100644
--- a/gitlab_docker_autodep/deps.py
+++ b/gitlab_docker_autodep/deps.py
@@ -1,16 +1,20 @@
 import gitlab
 import logging
+import re
 
 
+_from_rx = re.compile(r'^FROM\s+(.*)$')
+
 def _parse_dockerfile(df):
     for line in df.split('\n'):
-        if line.startswith('FROM '):
-            return line[5:].strip()
+        m = _from_rx.match(line)
+        if m:
+            return m.group(1)
 
 
-def _fetch_dockerfile(gl, project):
+def _fetch_dockerfile(gl, project, ref):
     try:
-        f = project.files.get(file_path='Dockerfile', ref='master')
+        f = project.files.get(file_path='Dockerfile', ref=ref)
         return f.decode()
     except:
         return None
@@ -22,13 +26,34 @@ def _remove_image_tag(name):
     return name
 
 
-def build_dependency_tree(gl, search_pattern=None):
-    """Build the project dependency map based on Dockerfiles."""
+def build_dependency_tree(gl, search_pattern=None, filter_pattern=None):
+    """Build the project dependency map based on Dockerfiles.
+
+    This can be a fairly expensive (long) operation if the list of
+    projects is large. The 'search_pattern' argument allows for
+    filtering on the server side, using Gitlab search query syntax.
+    On the client side, the project list can be filtered with a
+    regular expression using the 'filter_pattern' argument, which will
+    be applied to the project's path_with_namespace.
+
+    Returns a dict of the form {image_name: [projects]}, where
+    'projects' is the list of projects that have 'image_name' as their
+    base Docker image. These are gitlab.Project instances.
+
+    We only examine Dockerfiles in the master branch of repositories.
+
+    """
     deps = {}
 
+    filter_rx = None
+    if filter_pattern:
+        filter_rx = re.compile(filter_pattern)
+
     projects = gl.projects.list(all=True, search=search_pattern, as_list=False)
     for project in projects:
-        df = _fetch_dockerfile(gl, project)
+        if filter_rx is not None and not filter_rx.search(project.path_with_namespace):
+            continue
+        df = _fetch_dockerfile(gl, project, 'master')
         if not df:
             continue
         base_image = _parse_dockerfile(df)
@@ -41,6 +66,7 @@ def build_dependency_tree(gl, search_pattern=None):
 
 
 def rebuild(project, wait=False):
+    """Trigger a rebuild of a project."""
     pipeline = project.pipelines.create({'ref': 'master'})
     if wait:
         while pipeline.finished_at is None:
@@ -50,7 +76,7 @@ def rebuild(project, wait=False):
 
 
 def rebuild_deps(gitlab_url, registry_hostname, gitlab_token,
-                 search_pattern, image_name,
+                 search_pattern, filter_pattern, image_name,
                  dry_run=False, recurse=False, wait=False):
     """Rebuild dependencies of the given image."""
     gl = gitlab.Gitlab(gitlab_url, private_token=gitlab_token)
@@ -63,7 +89,7 @@ def rebuild_deps(gitlab_url, registry_hostname, gitlab_token,
     while stack:
         project = stack.pop(0)
 
-        print 'rebuilding %s' % project.path_with_namespace
+        logging.info('rebuilding %s', project.path_with_namespace)
         if not dry_run:
             pipeline = rebuild(project, wait)
             if pipeline.status != 'success':
diff --git a/gitlab_docker_autodep/main.py b/gitlab_docker_autodep/main.py
index 34d40b3..6a22dcb 100644
--- a/gitlab_docker_autodep/main.py
+++ b/gitlab_docker_autodep/main.py
@@ -14,17 +14,30 @@ def main():
     subparsers = parser.add_subparsers(dest='subparser')
 
     # Common options.
-    parser.add_argument('--token', metavar='TOKEN',
-                        help='Gitlab authentication token')
-    parser.add_argument('--registry', metavar='NAME',
-                        help='Docker registry hostname (if empty, it will be '
-                        'automatically derived from --url)')
-    parser.add_argument('--url', metavar='URL', help='Gitlab URL')
-    parser.add_argument('--debug', action='store_true')
+    common_parser = argparse.ArgumentParser(add_help=False)
+    gitlab_opts_group = common_parser.add_argument_group('gitlab options')
+    gitlab_opts_group.add_argument(
+        '--url', metavar='URL', help='Gitlab URL')
+    gitlab_opts_group.add_argument(
+        '--token', metavar='TOKEN',
+        help='Gitlab authentication token')
+    gitlab_opts_group.add_argument(
+        '--registry', metavar='NAME',
+        help='Docker registry hostname (if empty, it will be '
+        'automatically derived from --url)')
+    scope_opts_group = common_parser.add_argument_group('project scope options')
+    scope_opts_group.add_argument(
+        '--match',
+        help='Search query to filter project list on the server side')
+    scope_opts_group.add_argument(
+        '--filter',
+        help='Regexp to filter project list on the client side')
+    common_parser.add_argument('--debug', action='store_true')
 
     # Rebuild deps.
     rebuild_image_parser = subparsers.add_parser(
         'rebuild',
+        parents=[common_parser],
         help='rebuild dependencies of an image',
         description='Rebuild all projects that depend on the specified '
         'Docker image.')
@@ -35,9 +48,6 @@ def main():
         '--recurse', action='store_true',
         help='Include all dependencies recursively '
         'and wait for completion of the pipelines')
-    rebuild_image_parser.add_argument(
-        '--match',
-        help='Search keyword(s) to filter project list')
     rebuild_image_parser.add_argument(
         'image_name',
         help='Docker image name')
@@ -45,6 +55,7 @@ def main():
     # Server.
     server_parser = subparsers.add_parser(
         'server',
+        parents=[common_parser],
         help='start a HTTP server',
         description='Start a HTTP server that listens for Gitlab webhooks. '
         'Configure Gitlab to send Pipeline events for your projects to this '
@@ -82,7 +93,8 @@ def main():
             registry_hostname,
             args.token,
             args.match,
-            args[0],
+            args.filter,
+            args.image_name,
             args.dry_run,
             args.recurse,
             args.recurse,
@@ -92,6 +104,8 @@ def main():
             args.url,
             registry_hostname,
             args.token,
+            args.match,
+            args.filter,
             args.bind_host,
             args.bind_port,
             args.webhook_auth_token,
diff --git a/gitlab_docker_autodep/server.py b/gitlab_docker_autodep/server.py
index 4272b60..1273f88 100644
--- a/gitlab_docker_autodep/server.py
+++ b/gitlab_docker_autodep/server.py
@@ -26,13 +26,13 @@ class _DepsCache(object):
         with self._deps_lock:
             return self._deps.get(image_name, [])
 
-    def update_thread(self):
+    def update_thread(self, search_pattern, filter_pattern):
         loaded = False
         while True:
             try:
                 if not loaded:
                     app.logger.info('scanning project dependencies...')
-                new_deps = build_dependency_tree(app.gl)
+                new_deps = build_dependency_tree(app.gl, search_pattern, filter_pattern)
                 with self._deps_lock:
                     self._deps = new_deps
                 if not loaded:
@@ -81,8 +81,8 @@ def worker_thread():
 
 
 def run_app(gitlab_url, registry_hostname, gitlab_token,
-            bind_host, bind_port, webhook_token,
-            num_workers=2):
+            search_pattern, filter_pattern, bind_host, bind_port,
+            webhook_token, num_workers=2):
     app.config.update({
         'REGISTRY_HOSTNAME': registry_hostname,
         'WEBHOOK_AUTH_TOKEN': webhook_token,
@@ -96,6 +96,7 @@ def run_app(gitlab_url, registry_hostname, gitlab_token,
     # dependency map (an expensive operation).
     update_t = threading.Thread(
         target=deps_cache.update_thread,
+        args=(search_pattern, filter_pattern),
         name='Dependency Update Thread')
     update_t.setDaemon(True)
     update_t.start()
-- 
GitLab