From 6aa9930801bc73c3c848c5ef405a9f0a3b7227a0 Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Wed, 17 Oct 2018 19:11:40 +0200 Subject: better comments for client_matrix.py --- tools/interop_matrix/client_matrix.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/interop_matrix/client_matrix.py b/tools/interop_matrix/client_matrix.py index 15b53d1716..911c351305 100644 --- a/tools/interop_matrix/client_matrix.py +++ b/tools/interop_matrix/client_matrix.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Dictionaries used for client matrix testing. +# Defines languages, runtimes and releases for backward compatibility testing def get_github_repo(lang): @@ -53,8 +53,7 @@ LANG_RUNTIME_MATRIX = { 'csharp': ['csharp', 'csharpcoreclr'], } -# Dictionary of releases per language. For each language, we need to provide -# a release tag pointing to the latest build of the branch. +# Dictionary of known releases for given language. LANG_RELEASE_MATRIX = { 'cxx': [ { -- cgit v1.2.3 From 7773f6cc6e60be81348371553d25475e2c55f48f Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Thu, 18 Oct 2018 18:13:03 +0200 Subject: cleanup of run_interop_matrix_tests.py --- tools/interop_matrix/run_interop_matrix_tests.py | 216 ++++++++++++----------- 1 file changed, 113 insertions(+), 103 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index 6cd6f43167..d5388a22f7 100755 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -26,7 +26,7 @@ import subprocess import sys import uuid -# Langauage Runtime Matrix +# Language Runtime Matrix import client_matrix python_util_dir = os.path.abspath( @@ -37,6 +37,8 @@ import jobset import report_utils import upload_test_results +_TEST_TIMEOUT_SECONDS = 60 +_PULL_IMAGE_TIMEOUT_SECONDS = 10 * 60 _LANGUAGES = client_matrix.LANG_RUNTIME_MATRIX.keys() # All gRPC release tags, flattened, deduped and sorted. _RELEASES = sorted( @@ -45,7 +47,6 @@ _RELEASES = sorted( client_matrix.get_release_tag_name(info) for lang in client_matrix.LANG_RELEASE_MATRIX.values() for info in lang))) -_TEST_TIMEOUT = 60 argp = argparse.ArgumentParser(description='Run interop tests.') argp.add_argument('-j', '--jobs', default=multiprocessing.cpu_count(), type=int) @@ -56,7 +57,7 @@ argp.add_argument( argp.add_argument( '--release', default='all', - choices=['all', 'master'] + _RELEASES, + choices=['all'] + _RELEASES, help='Release tags to test. When testing all ' 'releases defined in client_matrix.py, use "all".') argp.add_argument( @@ -92,136 +93,149 @@ argp.add_argument( nargs='?', help='The gateway to backend services.') -args = argp.parse_args() - -print(str(args)) - -def find_all_images_for_lang(lang): +def _get_test_images_for_lang(lang, release_arg, image_path_prefix): """Find docker images for a language across releases and runtimes. Returns dictionary of list of (, ) keyed by runtime. """ - # Find all defined releases. - if args.release == 'all': - releases = ['master'] + client_matrix.get_release_tags(lang) + if release_arg == 'all': + # Use all defined releases for given language + releases = client_matrix.get_release_tags(lang) else: # Look for a particular release. 
- if args.release not in ['master' - ] + client_matrix.get_release_tags(lang): + if release_arg not in client_matrix.get_release_tags(lang): jobset.message( 'SKIPPED', - '%s for %s is not defined' % (args.release, lang), + 'release %s for %s is not defined' % (release_arg, lang), do_newline=True) return {} - releases = [args.release] + releases = [release_arg] - # TODO(jtattermusch): why do we need to query the existing images/tags? - # From LANG_RUNTIME_MATRIX and LANG_RELEASE_MATRIX it should be obvious - # which tags we want to test - and it should be an error if they are - # missing. # Images tuples keyed by runtime. images = {} for runtime in client_matrix.LANG_RUNTIME_MATRIX[lang]: - image_path = '%s/grpc_interop_%s' % (args.gcr_path, runtime) - output = subprocess.check_output([ - 'gcloud', 'beta', 'container', 'images', 'list-tags', - '--format=json', image_path - ]) - docker_image_list = json.loads(output) - # All images should have a single tag or no tag. - # TODO(adelez): Remove tagless images. - tags = [i['tags'][0] for i in docker_image_list if i['tags']] - jobset.message( - 'START', - 'Found images for %s: %s' % (image_path, tags), - do_newline=True) - skipped = len(docker_image_list) - len(tags) - jobset.message( - 'SKIPPED', - 'Skipped images (no-tag/unknown-tag): %d' % skipped, - do_newline=True) - # Filter tags based on the releases. - images[runtime] = [(tag, '%s:%s' % (image_path, tag)) - for tag in tags - if tag in releases] + image_path = '%s/grpc_interop_%s' % (image_path_prefix, runtime) + images[runtime] = [ + (tag, '%s:%s' % (image_path, tag)) for tag in releases + ] return images -# caches test cases (list of JobSpec) loaded from file. Keyed by lang and runtime. -def find_test_cases(lang, runtime, release, suite_name): - """Returns the list of test cases from testcase files per lang/release.""" +def _read_test_cases_file(lang, runtime, release): + """Read test cases from a bash-like file and return a list of commands""" testcase_dir = os.path.join(os.path.dirname(__file__), 'testcases') filename_prefix = lang if lang == 'csharp': + # TODO(jtattermusch): remove this odd specialcase filename_prefix = runtime # Check to see if we need to use a particular version of test cases. lang_version = '%s_%s' % (filename_prefix, release) if lang_version in client_matrix.TESTCASES_VERSION_MATRIX: - testcases = os.path.join( + testcase_file = os.path.join( testcase_dir, client_matrix.TESTCASES_VERSION_MATRIX[lang_version]) else: - testcases = os.path.join(testcase_dir, '%s__master' % filename_prefix) + # TODO(jtattermusch): remove the double-underscore, it is pointless + testcase_file = os.path.join(testcase_dir, + '%s__master' % filename_prefix) + + lines = [] + with open(testcase_file) as f: + for line in f.readlines(): + line = re.sub('\\#.*$', '', line) # remove hash comments + line = line.strip() + if line and not line.startswith('echo'): + # Each non-empty line is a treated as a test case command + lines.append(line) + return lines + + +def _cleanup_docker_image(image): + jobset.message('START', 'Cleanup docker image %s' % image, do_newline=True) + dockerjob.remove_image(image, skip_nonexistent=True) + + +args = argp.parse_args() + + +# caches test cases (list of JobSpec) loaded from file. Keyed by lang and runtime. 
+def _generate_test_case_jobspecs(lang, runtime, release, suite_name): + """Returns the list of test cases from testcase files per lang/release.""" + testcase_lines = _read_test_cases_file(lang, runtime, release) job_spec_list = [] - try: - with open(testcases) as f: - # Only line start with 'docker run' are test cases. - for line in f.readlines(): - if line.startswith('docker run'): - m = re.search('--test_case=(.*)"', line) - shortname = m.group(1) if m else 'unknown_test' - m = re.search( - '--server_host_override=(.*).sandbox.googleapis.com', - line) - server = m.group(1) if m else 'unknown_server' - - # If server_host arg is not None, replace the original - # server_host with the one provided or append to the end of - # the command if server_host does not appear originally. - if args.server_host: - if line.find('--server_host=') > -1: - line = re.sub('--server_host=[^ ]*', - '--server_host=%s' % args.server_host, - line) - else: - line = '%s --server_host=%s"' % (line[:-1], - args.server_host) - print(line) - - spec = jobset.JobSpec( - cmdline=line, - shortname='%s:%s:%s:%s' % (suite_name, lang, server, - shortname), - timeout_seconds=_TEST_TIMEOUT, - shell=True, - flake_retries=5 if args.allow_flakes else 0) - job_spec_list.append(spec) - jobset.message( - 'START', - 'Loaded %s tests from %s' % (len(job_spec_list), testcases), - do_newline=True) - except IOError as err: - jobset.message('FAILED', err, do_newline=True) + for line in testcase_lines: + m = re.search('--test_case=(.*)"', line) + shortname = m.group(1) if m else 'unknown_test' + m = re.search('--server_host_override=(.*).sandbox.googleapis.com', + line) + server = m.group(1) if m else 'unknown_server' + + # If server_host arg is not None, replace the original + # server_host with the one provided or append to the end of + # the command if server_host does not appear originally. + if args.server_host: + if line.find('--server_host=') > -1: + line = re.sub('--server_host=[^ ]*', + '--server_host=%s' % args.server_host, line) + else: + line = '%s --server_host=%s"' % (line[:-1], args.server_host) + + spec = jobset.JobSpec( + cmdline=line, + shortname='%s:%s:%s:%s' % (suite_name, lang, server, shortname), + timeout_seconds=_TEST_TIMEOUT_SECONDS, + shell=True, + flake_retries=5 if args.allow_flakes else 0) + job_spec_list.append(spec) return job_spec_list -_xml_report_tree = report_utils.new_junit_xml_tree() +def _pull_images_for_lang(lang, images): + """Pull all images for given lang from container registry.""" + jobset.message( + 'START', 'Downloading images for language "%s"' % lang, do_newline=True) + download_specs = [] + for release, image in images: + # Pull the image and warm it up. + # First time we use an image with "docker run", it takes time to unpack the image + # and later this delay would fail our test cases. 
+ cmdline = [ + 'gcloud docker -- pull %s && docker run --rm=true %s /bin/true' % + (image, image) + ] + spec = jobset.JobSpec( + cmdline=cmdline, + shortname='pull_image_%s' % (image), + timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, + shell=True) + download_specs.append(spec) + num_failures, resultset = jobset.run( + download_specs, newline_on_success=True, maxjobs=args.jobs) + if num_failures: + jobset.message( + 'FAILED', 'Failed to download some images', do_newline=True) + return False + else: + jobset.message( + 'SUCCESS', 'All images downloaded successfully.', do_newline=True) + return True -def run_tests_for_lang(lang, runtime, images): +def _run_tests_for_lang(lang, runtime, images, xml_report_tree): """Find and run all test cases for a language. images is a list of (, ) tuple. """ + # Fine to ignore return value as failure to download will result in test failure + # later anyway. + _pull_images_for_lang(lang, images) + total_num_failures = 0 - for image_tuple in images: - release, image = image_tuple - jobset.message('START', 'Testing %s' % image, do_newline=True) - # Download the docker image before running each test case. - subprocess.check_call(['gcloud', 'docker', '--', 'pull', image]) + for release, image in images: suite_name = '%s__%s_%s' % (lang, runtime, release) - job_spec_list = find_test_cases(lang, runtime, release, suite_name) + job_spec_list = _generate_test_case_jobspecs(lang, runtime, release, + suite_name) if not job_spec_list: jobset.message( @@ -242,28 +256,24 @@ def run_tests_for_lang(lang, runtime, images): else: jobset.message('SUCCESS', 'All tests passed', do_newline=True) - report_utils.append_junit_xml_results(_xml_report_tree, resultset, + report_utils.append_junit_xml_results(xml_report_tree, resultset, 'grpc_interop_matrix', suite_name, str(uuid.uuid4())) if not args.keep: - cleanup(image) + _cleanup_docker_image(image) return total_num_failures -def cleanup(image): - jobset.message('START', 'Cleanup docker image %s' % image, do_newline=True) - dockerjob.remove_image(image, skip_nonexistent=True) - - languages = args.language if args.language != ['all'] else _LANGUAGES total_num_failures = 0 +_xml_report_tree = report_utils.new_junit_xml_tree() for lang in languages: - docker_images = find_all_images_for_lang(lang) + docker_images = _get_test_images_for_lang(lang, args.release, args.gcr_path) for runtime in sorted(docker_images.keys()): - total_num_failures += run_tests_for_lang(lang, runtime, - docker_images[runtime]) + total_num_failures += _run_tests_for_lang( + lang, runtime, docker_images[runtime], _xml_report_tree) report_utils.create_xml_report_file(_xml_report_tree, args.report_file) -- cgit v1.2.3 From 32495f4ceecf4dbabaf86b20701a7301638da9f6 Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Tue, 23 Oct 2018 16:18:39 +0200 Subject: address review comments --- tools/internal_ci/linux/grpc_interop_matrix.sh | 2 +- tools/interop_matrix/run_interop_matrix_tests.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/internal_ci/linux/grpc_interop_matrix.sh b/tools/internal_ci/linux/grpc_interop_matrix.sh index fea365eeff..a5220ea087 100755 --- a/tools/internal_ci/linux/grpc_interop_matrix.sh +++ b/tools/internal_ci/linux/grpc_interop_matrix.sh @@ -22,4 +22,4 @@ cd $(dirname $0)/../../.. 
source tools/internal_ci/helper_scripts/prepare_build_linux_rc -tools/interop_matrix/run_interop_matrix_tests.py $RUN_TESTS_FLAGS \ No newline at end of file +tools/interop_matrix/run_interop_matrix_tests.py $RUN_TESTS_FLAGS diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index d5388a22f7..be9edc0b9b 100755 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -165,6 +165,8 @@ def _generate_test_case_jobspecs(lang, runtime, release, suite_name): job_spec_list = [] for line in testcase_lines: + # TODO(jtattermusch): revisit the logic for updating test case commands + # what it currently being done seems fragile. m = re.search('--test_case=(.*)"', line) shortname = m.group(1) if m else 'unknown_test' m = re.search('--server_host_override=(.*).sandbox.googleapis.com', @@ -198,8 +200,8 @@ def _pull_images_for_lang(lang, images): download_specs = [] for release, image in images: # Pull the image and warm it up. - # First time we use an image with "docker run", it takes time to unpack the image - # and later this delay would fail our test cases. + # First time we use an image with "docker run", it takes time to unpack + # the image and later this delay would fail our test cases. cmdline = [ 'gcloud docker -- pull %s && docker run --rm=true %s /bin/true' % (image, image) -- cgit v1.2.3 From 02be5c002e58c056fb5c38974113ee5bee169ee4 Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Thu, 25 Oct 2018 13:21:06 +0200 Subject: limit download parallelism --- tools/interop_matrix/run_interop_matrix_tests.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'tools/interop_matrix') diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index be9edc0b9b..6cf2a9b036 100755 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -38,7 +38,8 @@ import report_utils import upload_test_results _TEST_TIMEOUT_SECONDS = 60 -_PULL_IMAGE_TIMEOUT_SECONDS = 10 * 60 +_PULL_IMAGE_TIMEOUT_SECONDS = 15 * 60 +_MAX_PARALLEL_DOWNLOADS = 6 _LANGUAGES = client_matrix.LANG_RUNTIME_MATRIX.keys() # All gRPC release tags, flattened, deduped and sorted. _RELEASES = sorted( @@ -203,8 +204,8 @@ def _pull_images_for_lang(lang, images): # First time we use an image with "docker run", it takes time to unpack # the image and later this delay would fail our test cases. cmdline = [ - 'gcloud docker -- pull %s && docker run --rm=true %s /bin/true' % - (image, image) + 'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true' + % (image, image) ] spec = jobset.JobSpec( cmdline=cmdline, @@ -212,8 +213,10 @@ def _pull_images_for_lang(lang, images): timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, shell=True) download_specs.append(spec) + # too many image downloads at once tend to get stuck + max_pull_jobs = min(args.jobs, _MAX_PARALLEL_DOWNLOADS) num_failures, resultset = jobset.run( - download_specs, newline_on_success=True, maxjobs=args.jobs) + download_specs, newline_on_success=True, maxjobs=max_pull_jobs) if num_failures: jobset.message( 'FAILED', 'Failed to download some images', do_newline=True) @@ -229,9 +232,10 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree): images is a list of (, ) tuple. """ - # Fine to ignore return value as failure to download will result in test failure - # later anyway. 
- _pull_images_for_lang(lang, images) + if not _pull_images_for_lang(lang, images): + jobset.message( + 'FAILED', 'Image download failed. Exiting.', do_newline=True) + return 1 total_num_failures = 0 for release, image in images: -- cgit v1.2.3
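
Taken together, the last two patches converge on a simple pattern: pull every Docker image and warm it up with a no-op "docker run" before any interop test case executes, and cap how many pulls run in parallel so the downloads do not get stuck. Below is a minimal, illustrative sketch of that pattern using only the Python 3 standard library; it is not part of the patch series. The timeout and download cap mirror the values introduced in the patches, while the image names and project path are placeholders.

# Minimal Python 3 sketch of the "pre-pull with bounded parallelism" idea from
# the patches above. Not part of the patch series; the real script drives the
# pulls through the repository's jobset helpers instead of concurrent.futures.
import subprocess
from concurrent.futures import ThreadPoolExecutor

_MAX_PARALLEL_DOWNLOADS = 6  # same cap as the patch; too many pulls at once tend to get stuck
_PULL_IMAGE_TIMEOUT_SECONDS = 15 * 60


def _pull_and_warm(image):
    """Pull the image, then start a no-op container so its layers get unpacked."""
    cmd = ('gcloud docker -- pull %s && docker run --rm=true %s /bin/true' %
           (image, image))
    try:
        return subprocess.call(
            cmd, shell=True, timeout=_PULL_IMAGE_TIMEOUT_SECONDS) == 0
    except subprocess.TimeoutExpired:
        return False


def pull_images(images, jobs):
    """Return True only if every image was pulled and warmed up successfully."""
    max_workers = min(jobs, _MAX_PARALLEL_DOWNLOADS)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return all(pool.map(_pull_and_warm, images))


if __name__ == '__main__':
    # Hypothetical image names, for illustration only.
    images = [
        'gcr.io/example-project/grpc_interop_go:v1.14.0',
        'gcr.io/example-project/grpc_interop_csharp:v1.14.0',
    ]
    print('all images ready' if pull_images(images, jobs=8) else
          'some downloads failed')

A thread pool is a reasonable stand-in here because the work is I/O-bound; the patched script achieves the same throttling by passing maxjobs=min(args.jobs, _MAX_PARALLEL_DOWNLOADS) to jobset.run().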