From 6aa9930801bc73c3c848c5ef405a9f0a3b7227a0 Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Wed, 17 Oct 2018 19:11:40 +0200 Subject: better comments for client_matrix.py --- tools/interop_matrix/client_matrix.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/interop_matrix/client_matrix.py b/tools/interop_matrix/client_matrix.py index 15b53d1716..911c351305 100644 --- a/tools/interop_matrix/client_matrix.py +++ b/tools/interop_matrix/client_matrix.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Dictionaries used for client matrix testing. +# Defines languages, runtimes and releases for backward compatibility testing def get_github_repo(lang): @@ -53,8 +53,7 @@ LANG_RUNTIME_MATRIX = { 'csharp': ['csharp', 'csharpcoreclr'], } -# Dictionary of releases per language. For each language, we need to provide -# a release tag pointing to the latest build of the branch. +# Dictionary of known releases for given language. LANG_RELEASE_MATRIX = { 'cxx': [ { -- cgit v1.2.3 From 7773f6cc6e60be81348371553d25475e2c55f48f Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Thu, 18 Oct 2018 18:13:03 +0200 Subject: cleanup of run_interop_matrix_tests.py --- tools/interop_matrix/run_interop_matrix_tests.py | 216 ++++++++++++----------- 1 file changed, 113 insertions(+), 103 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index 6cd6f43167..d5388a22f7 100755 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -26,7 +26,7 @@ import subprocess import sys import uuid -# Langauage Runtime Matrix +# Language Runtime Matrix import client_matrix python_util_dir = os.path.abspath( @@ -37,6 +37,8 @@ import jobset import report_utils import upload_test_results +_TEST_TIMEOUT_SECONDS = 60 +_PULL_IMAGE_TIMEOUT_SECONDS = 10 * 60 _LANGUAGES = client_matrix.LANG_RUNTIME_MATRIX.keys() # All gRPC release tags, flattened, deduped and sorted. _RELEASES = sorted( @@ -45,7 +47,6 @@ _RELEASES = sorted( client_matrix.get_release_tag_name(info) for lang in client_matrix.LANG_RELEASE_MATRIX.values() for info in lang))) -_TEST_TIMEOUT = 60 argp = argparse.ArgumentParser(description='Run interop tests.') argp.add_argument('-j', '--jobs', default=multiprocessing.cpu_count(), type=int) @@ -56,7 +57,7 @@ argp.add_argument( argp.add_argument( '--release', default='all', - choices=['all', 'master'] + _RELEASES, + choices=['all'] + _RELEASES, help='Release tags to test. When testing all ' 'releases defined in client_matrix.py, use "all".') argp.add_argument( @@ -92,136 +93,149 @@ argp.add_argument( nargs='?', help='The gateway to backend services.') -args = argp.parse_args() - -print(str(args)) - -def find_all_images_for_lang(lang): +def _get_test_images_for_lang(lang, release_arg, image_path_prefix): """Find docker images for a language across releases and runtimes. Returns dictionary of list of (, ) keyed by runtime. """ - # Find all defined releases. - if args.release == 'all': - releases = ['master'] + client_matrix.get_release_tags(lang) + if release_arg == 'all': + # Use all defined releases for given language + releases = client_matrix.get_release_tags(lang) else: # Look for a particular release. 
- if args.release not in ['master' - ] + client_matrix.get_release_tags(lang): + if release_arg not in client_matrix.get_release_tags(lang): jobset.message( 'SKIPPED', - '%s for %s is not defined' % (args.release, lang), + 'release %s for %s is not defined' % (release_arg, lang), do_newline=True) return {} - releases = [args.release] + releases = [release_arg] - # TODO(jtattermusch): why do we need to query the existing images/tags? - # From LANG_RUNTIME_MATRIX and LANG_RELEASE_MATRIX it should be obvious - # which tags we want to test - and it should be an error if they are - # missing. # Images tuples keyed by runtime. images = {} for runtime in client_matrix.LANG_RUNTIME_MATRIX[lang]: - image_path = '%s/grpc_interop_%s' % (args.gcr_path, runtime) - output = subprocess.check_output([ - 'gcloud', 'beta', 'container', 'images', 'list-tags', - '--format=json', image_path - ]) - docker_image_list = json.loads(output) - # All images should have a single tag or no tag. - # TODO(adelez): Remove tagless images. - tags = [i['tags'][0] for i in docker_image_list if i['tags']] - jobset.message( - 'START', - 'Found images for %s: %s' % (image_path, tags), - do_newline=True) - skipped = len(docker_image_list) - len(tags) - jobset.message( - 'SKIPPED', - 'Skipped images (no-tag/unknown-tag): %d' % skipped, - do_newline=True) - # Filter tags based on the releases. - images[runtime] = [(tag, '%s:%s' % (image_path, tag)) - for tag in tags - if tag in releases] + image_path = '%s/grpc_interop_%s' % (image_path_prefix, runtime) + images[runtime] = [ + (tag, '%s:%s' % (image_path, tag)) for tag in releases + ] return images -# caches test cases (list of JobSpec) loaded from file. Keyed by lang and runtime. -def find_test_cases(lang, runtime, release, suite_name): - """Returns the list of test cases from testcase files per lang/release.""" +def _read_test_cases_file(lang, runtime, release): + """Read test cases from a bash-like file and return a list of commands""" testcase_dir = os.path.join(os.path.dirname(__file__), 'testcases') filename_prefix = lang if lang == 'csharp': + # TODO(jtattermusch): remove this odd specialcase filename_prefix = runtime # Check to see if we need to use a particular version of test cases. lang_version = '%s_%s' % (filename_prefix, release) if lang_version in client_matrix.TESTCASES_VERSION_MATRIX: - testcases = os.path.join( + testcase_file = os.path.join( testcase_dir, client_matrix.TESTCASES_VERSION_MATRIX[lang_version]) else: - testcases = os.path.join(testcase_dir, '%s__master' % filename_prefix) + # TODO(jtattermusch): remove the double-underscore, it is pointless + testcase_file = os.path.join(testcase_dir, + '%s__master' % filename_prefix) + + lines = [] + with open(testcase_file) as f: + for line in f.readlines(): + line = re.sub('\\#.*$', '', line) # remove hash comments + line = line.strip() + if line and not line.startswith('echo'): + # Each non-empty line is a treated as a test case command + lines.append(line) + return lines + + +def _cleanup_docker_image(image): + jobset.message('START', 'Cleanup docker image %s' % image, do_newline=True) + dockerjob.remove_image(image, skip_nonexistent=True) + + +args = argp.parse_args() + + +# caches test cases (list of JobSpec) loaded from file. Keyed by lang and runtime. 
+def _generate_test_case_jobspecs(lang, runtime, release, suite_name): + """Returns the list of test cases from testcase files per lang/release.""" + testcase_lines = _read_test_cases_file(lang, runtime, release) job_spec_list = [] - try: - with open(testcases) as f: - # Only line start with 'docker run' are test cases. - for line in f.readlines(): - if line.startswith('docker run'): - m = re.search('--test_case=(.*)"', line) - shortname = m.group(1) if m else 'unknown_test' - m = re.search( - '--server_host_override=(.*).sandbox.googleapis.com', - line) - server = m.group(1) if m else 'unknown_server' - - # If server_host arg is not None, replace the original - # server_host with the one provided or append to the end of - # the command if server_host does not appear originally. - if args.server_host: - if line.find('--server_host=') > -1: - line = re.sub('--server_host=[^ ]*', - '--server_host=%s' % args.server_host, - line) - else: - line = '%s --server_host=%s"' % (line[:-1], - args.server_host) - print(line) - - spec = jobset.JobSpec( - cmdline=line, - shortname='%s:%s:%s:%s' % (suite_name, lang, server, - shortname), - timeout_seconds=_TEST_TIMEOUT, - shell=True, - flake_retries=5 if args.allow_flakes else 0) - job_spec_list.append(spec) - jobset.message( - 'START', - 'Loaded %s tests from %s' % (len(job_spec_list), testcases), - do_newline=True) - except IOError as err: - jobset.message('FAILED', err, do_newline=True) + for line in testcase_lines: + m = re.search('--test_case=(.*)"', line) + shortname = m.group(1) if m else 'unknown_test' + m = re.search('--server_host_override=(.*).sandbox.googleapis.com', + line) + server = m.group(1) if m else 'unknown_server' + + # If server_host arg is not None, replace the original + # server_host with the one provided or append to the end of + # the command if server_host does not appear originally. + if args.server_host: + if line.find('--server_host=') > -1: + line = re.sub('--server_host=[^ ]*', + '--server_host=%s' % args.server_host, line) + else: + line = '%s --server_host=%s"' % (line[:-1], args.server_host) + + spec = jobset.JobSpec( + cmdline=line, + shortname='%s:%s:%s:%s' % (suite_name, lang, server, shortname), + timeout_seconds=_TEST_TIMEOUT_SECONDS, + shell=True, + flake_retries=5 if args.allow_flakes else 0) + job_spec_list.append(spec) return job_spec_list -_xml_report_tree = report_utils.new_junit_xml_tree() +def _pull_images_for_lang(lang, images): + """Pull all images for given lang from container registry.""" + jobset.message( + 'START', 'Downloading images for language "%s"' % lang, do_newline=True) + download_specs = [] + for release, image in images: + # Pull the image and warm it up. + # First time we use an image with "docker run", it takes time to unpack the image + # and later this delay would fail our test cases. 
+ cmdline = [ + 'gcloud docker -- pull %s && docker run --rm=true %s /bin/true' % + (image, image) + ] + spec = jobset.JobSpec( + cmdline=cmdline, + shortname='pull_image_%s' % (image), + timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, + shell=True) + download_specs.append(spec) + num_failures, resultset = jobset.run( + download_specs, newline_on_success=True, maxjobs=args.jobs) + if num_failures: + jobset.message( + 'FAILED', 'Failed to download some images', do_newline=True) + return False + else: + jobset.message( + 'SUCCESS', 'All images downloaded successfully.', do_newline=True) + return True -def run_tests_for_lang(lang, runtime, images): +def _run_tests_for_lang(lang, runtime, images, xml_report_tree): """Find and run all test cases for a language. images is a list of (, ) tuple. """ + # Fine to ignore return value as failure to download will result in test failure + # later anyway. + _pull_images_for_lang(lang, images) + total_num_failures = 0 - for image_tuple in images: - release, image = image_tuple - jobset.message('START', 'Testing %s' % image, do_newline=True) - # Download the docker image before running each test case. - subprocess.check_call(['gcloud', 'docker', '--', 'pull', image]) + for release, image in images: suite_name = '%s__%s_%s' % (lang, runtime, release) - job_spec_list = find_test_cases(lang, runtime, release, suite_name) + job_spec_list = _generate_test_case_jobspecs(lang, runtime, release, + suite_name) if not job_spec_list: jobset.message( @@ -242,28 +256,24 @@ def run_tests_for_lang(lang, runtime, images): else: jobset.message('SUCCESS', 'All tests passed', do_newline=True) - report_utils.append_junit_xml_results(_xml_report_tree, resultset, + report_utils.append_junit_xml_results(xml_report_tree, resultset, 'grpc_interop_matrix', suite_name, str(uuid.uuid4())) if not args.keep: - cleanup(image) + _cleanup_docker_image(image) return total_num_failures -def cleanup(image): - jobset.message('START', 'Cleanup docker image %s' % image, do_newline=True) - dockerjob.remove_image(image, skip_nonexistent=True) - - languages = args.language if args.language != ['all'] else _LANGUAGES total_num_failures = 0 +_xml_report_tree = report_utils.new_junit_xml_tree() for lang in languages: - docker_images = find_all_images_for_lang(lang) + docker_images = _get_test_images_for_lang(lang, args.release, args.gcr_path) for runtime in sorted(docker_images.keys()): - total_num_failures += run_tests_for_lang(lang, runtime, - docker_images[runtime]) + total_num_failures += _run_tests_for_lang( + lang, runtime, docker_images[runtime], _xml_report_tree) report_utils.create_xml_report_file(_xml_report_tree, args.report_file) -- cgit v1.2.3 From 32495f4ceecf4dbabaf86b20701a7301638da9f6 Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Tue, 23 Oct 2018 16:18:39 +0200 Subject: address review comments --- tools/internal_ci/linux/grpc_interop_matrix.sh | 2 +- tools/interop_matrix/run_interop_matrix_tests.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/internal_ci/linux/grpc_interop_matrix.sh b/tools/internal_ci/linux/grpc_interop_matrix.sh index fea365eeff..a5220ea087 100755 --- a/tools/internal_ci/linux/grpc_interop_matrix.sh +++ b/tools/internal_ci/linux/grpc_interop_matrix.sh @@ -22,4 +22,4 @@ cd $(dirname $0)/../../.. 
source tools/internal_ci/helper_scripts/prepare_build_linux_rc -tools/interop_matrix/run_interop_matrix_tests.py $RUN_TESTS_FLAGS \ No newline at end of file +tools/interop_matrix/run_interop_matrix_tests.py $RUN_TESTS_FLAGS diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index d5388a22f7..be9edc0b9b 100755 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -165,6 +165,8 @@ def _generate_test_case_jobspecs(lang, runtime, release, suite_name): job_spec_list = [] for line in testcase_lines: + # TODO(jtattermusch): revisit the logic for updating test case commands + # what it currently being done seems fragile. m = re.search('--test_case=(.*)"', line) shortname = m.group(1) if m else 'unknown_test' m = re.search('--server_host_override=(.*).sandbox.googleapis.com', @@ -198,8 +200,8 @@ def _pull_images_for_lang(lang, images): download_specs = [] for release, image in images: # Pull the image and warm it up. - # First time we use an image with "docker run", it takes time to unpack the image - # and later this delay would fail our test cases. + # First time we use an image with "docker run", it takes time to unpack + # the image and later this delay would fail our test cases. cmdline = [ 'gcloud docker -- pull %s && docker run --rm=true %s /bin/true' % (image, image) -- cgit v1.2.3 From 02be5c002e58c056fb5c38974113ee5bee169ee4 Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Thu, 25 Oct 2018 13:21:06 +0200 Subject: limit download parallelism --- tools/interop_matrix/run_interop_matrix_tests.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index be9edc0b9b..6cf2a9b036 100755 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -38,7 +38,8 @@ import report_utils import upload_test_results _TEST_TIMEOUT_SECONDS = 60 -_PULL_IMAGE_TIMEOUT_SECONDS = 10 * 60 +_PULL_IMAGE_TIMEOUT_SECONDS = 15 * 60 +_MAX_PARALLEL_DOWNLOADS = 6 _LANGUAGES = client_matrix.LANG_RUNTIME_MATRIX.keys() # All gRPC release tags, flattened, deduped and sorted. _RELEASES = sorted( @@ -203,8 +204,8 @@ def _pull_images_for_lang(lang, images): # First time we use an image with "docker run", it takes time to unpack # the image and later this delay would fail our test cases. cmdline = [ - 'gcloud docker -- pull %s && docker run --rm=true %s /bin/true' % - (image, image) + 'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true' + % (image, image) ] spec = jobset.JobSpec( cmdline=cmdline, @@ -212,8 +213,10 @@ def _pull_images_for_lang(lang, images): timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, shell=True) download_specs.append(spec) + # too many image downloads at once tend to get stuck + max_pull_jobs = min(args.jobs, _MAX_PARALLEL_DOWNLOADS) num_failures, resultset = jobset.run( - download_specs, newline_on_success=True, maxjobs=args.jobs) + download_specs, newline_on_success=True, maxjobs=max_pull_jobs) if num_failures: jobset.message( 'FAILED', 'Failed to download some images', do_newline=True) @@ -229,9 +232,10 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree): images is a list of (, ) tuple. """ - # Fine to ignore return value as failure to download will result in test failure - # later anyway. 
- _pull_images_for_lang(lang, images) + if not _pull_images_for_lang(lang, images): + jobset.message( + 'FAILED', 'Image download failed. Exiting.', do_newline=True) + return 1 total_num_failures = 0 for release, image in images: -- cgit v1.2.3
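
Taken together, the last two patches converge on a simple pattern: pull every Docker image and warm it up with a no-op "docker run" before any interop test case executes, and cap how many pulls run in parallel so the downloads do not get stuck. Below is a minimal, illustrative sketch of that pattern using only the Python 3 standard library; it is not part of the patch series. The timeout and download cap mirror the values introduced in the patches, while the image names and project path are placeholders.

# Minimal Python 3 sketch of the "pre-pull with bounded parallelism" idea from
# the patches above. Not part of the patch series; the real script drives the
# pulls through the repository's jobset helpers instead of concurrent.futures.
import subprocess
from concurrent.futures import ThreadPoolExecutor

_MAX_PARALLEL_DOWNLOADS = 6  # same cap as the patch; too many pulls at once tend to get stuck
_PULL_IMAGE_TIMEOUT_SECONDS = 15 * 60


def _pull_and_warm(image):
    """Pull the image, then start a no-op container so its layers get unpacked."""
    cmd = ('gcloud docker -- pull %s && docker run --rm=true %s /bin/true' %
           (image, image))
    try:
        return subprocess.call(
            cmd, shell=True, timeout=_PULL_IMAGE_TIMEOUT_SECONDS) == 0
    except subprocess.TimeoutExpired:
        return False


def pull_images(images, jobs):
    """Return True only if every image was pulled and warmed up successfully."""
    max_workers = min(jobs, _MAX_PARALLEL_DOWNLOADS)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return all(pool.map(_pull_and_warm, images))


if __name__ == '__main__':
    # Hypothetical image names, for illustration only.
    images = [
        'gcr.io/example-project/grpc_interop_go:v1.14.0',
        'gcr.io/example-project/grpc_interop_csharp:v1.14.0',
    ]
    print('all images ready' if pull_images(images, jobs=8) else
          'some downloads failed')

A thread pool is a reasonable stand-in here because the work is I/O-bound; the patched script achieves the same throttling by passing maxjobs=min(args.jobs, _MAX_PARALLEL_DOWNLOADS) to jobset.run().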