aboutsummaryrefslogtreecommitdiffhomepage
path: root/tools/run_tests/python_utils/jobset.py
blob: 08d652ae3f37e22ccd88c949629ce74f8238f82a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
# Copyright 2015 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Run a group of subprocesses and then finish."""

from __future__ import print_function

import logging
import multiprocessing
import os
import platform
import re
import signal
import subprocess
import sys
import tempfile
import time
import errno


# cpu cost measurement
measure_cpu_costs = False


_DEFAULT_MAX_JOBS = 16 * multiprocessing.cpu_count()
_MAX_RESULT_SIZE = 8192


# NOTE: If you change this, please make sure to test reviewing the
# github PR with http://reviewable.io, which is known to add UTF-8
# characters to the PR description, which leak into the environment here
# and cause failures.
def strip_non_ascii_chars(s):
  return ''.join(c for c in s if ord(c) < 128)


def sanitized_environment(env):
  sanitized = {}
  for key, value in env.items():
    sanitized[strip_non_ascii_chars(key)] = strip_non_ascii_chars(value)
  return sanitized


def platform_string():
  if platform.system() == 'Windows':
    return 'windows'
  elif platform.system()[:7] == 'MSYS_NT':
    return 'windows'
  elif platform.system() == 'Darwin':
    return 'mac'
  elif platform.system() == 'Linux':
    return 'linux'
  else:
    return 'posix'


# setup a signal handler so that signal.pause registers 'something'
# when a child finishes
# not using futures and threading to avoid a dependency on subprocess32
if platform_string() == 'windows':
  pass
else:
  have_alarm = False
  def alarm_handler(unused_signum, unused_frame):
    global have_alarm
    have_alarm = False

  signal.signal(signal.SIGCHLD, lambda unused_signum, unused_frame: None)
  signal.signal(signal.SIGALRM, alarm_handler)


_SUCCESS = object()
_FAILURE = object()
_RUNNING = object()
_KILLED = object()


_COLORS = {
    'red': [ 31, 0 ],
    'green': [ 32, 0 ],
    'yellow': [ 33, 0 ],
    'lightgray': [ 37, 0],
    'gray': [ 30, 1 ],
    'purple': [ 35, 0 ],
    'cyan': [ 36, 0 ]
    }


_BEGINNING_OF_LINE = '\x1b[0G'
_CLEAR_LINE = '\x1b[2K'


_TAG_COLOR = {
    'FAILED': 'red',
    'FLAKE': 'purple',
    'TIMEOUT_FLAKE': 'purple',
    'WARNING': 'yellow',
    'TIMEOUT': 'red',
    'PASSED': 'green',
    'START': 'gray',
    'WAITING': 'yellow',
    'SUCCESS': 'green',
    'IDLE': 'gray',
    'SKIPPED': 'cyan'
    }

_FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.INFO, format=_FORMAT)


def eintr_be_gone(fn):
  """Run fn until it doesn't stop because of EINTR"""
  while True:
    try:
      return fn()
    except IOError, e:
      if e.errno != errno.EINTR:
        raise



def message(tag, msg, explanatory_text=None, do_newline=False):
  if message.old_tag == tag and message.old_msg == msg and not explanatory_text:
    return
  message.old_tag = tag
  message.old_msg = msg
  while True:
    try:
      if platform_string() == 'windows' or not sys.stdout.isatty():
        if explanatory_text:
          logging.info(explanatory_text)
        logging.info('%s: %s', tag, msg)
      else:
        sys.stdout.write('%s%s%s\x1b[%d;%dm%s\x1b[0m: %s%s' % (
            _BEGINNING_OF_LINE,
            _CLEAR_LINE,
            '\n%s' % explanatory_text if explanatory_text is not None else '',
            _COLORS[_TAG_COLOR[tag]][1],
            _COLORS[_TAG_COLOR[tag]][0],
            tag,
            msg,
            '\n' if do_newline or explanatory_text is not None else ''))
      sys.stdout.flush()
      return
    except IOError, e:
      if e.errno != errno.EINTR:
        raise

message.old_tag = ''
message.old_msg = ''

def which(filename):
  if '/' in filename:
    return filename
  for path in os.environ['PATH'].split(os.pathsep):
    if os.path.exists(os.path.join(path, filename)):
      return os.path.join(path, filename)
  raise Exception('%s not found' % filename)


class JobSpec(object):
  """Specifies what to run for a job."""

  def __init__(self, cmdline, shortname=None, environ=None,
               cwd=None, shell=False, timeout_seconds=5*60, flake_retries=0,
               timeout_retries=0, kill_handler=None, cpu_cost=1.0,
               verbose_success=False):
    """
    Arguments:
      cmdline: a list of arguments to pass as the command line
      environ: a dictionary of environment variables to set in the child process
      kill_handler: a handler that will be called whenever job.kill() is invoked
      cpu_cost: number of cores per second this job needs
    """
    if environ is None:
      environ = {}
    self.cmdline = cmdline
    self.environ = environ
    self.shortname = cmdline[0] if shortname is None else shortname
    self.cwd = cwd
    self.shell = shell
    self.timeout_seconds = timeout_seconds
    self.flake_retries = flake_retries
    self.timeout_retries = timeout_retries
    self.kill_handler = kill_handler
    self.cpu_cost = cpu_cost
    self.verbose_success = verbose_success

  def identity(self):
    return '%r %r' % (self.cmdline, self.environ)

  def __hash__(self):
    return hash(self.identity())

  def __cmp__(self, other):
    return self.identity() == other.identity()

  def __repr__(self):
    return 'JobSpec(shortname=%s, cmdline=%s)' % (self.shortname, self.cmdline)

  def __str__(self):
    return '%s: %s %s' % (self.shortname,
                          ' '.join('%s=%s' % kv for kv in self.environ.items()),
                          ' '.join(self.cmdline))


class JobResult(object):
  def __init__(self):
    self.state = 'UNKNOWN'
    self.returncode = -1
    self.elapsed_time = 0
    self.num_failures = 0
    self.retries = 0
    self.message = ''
    self.cpu_estimated = 1
    self.cpu_measured = 0


def read_from_start(f):
  f.seek(0)
  return f.read()


class Job(object):
  """Manages one job."""

  def __init__(self, spec, newline_on_success, travis, add_env,
               quiet_success=False):
    self._spec = spec
    self._newline_on_success = newline_on_success
    self._travis = travis
    self._add_env = add_env.copy()
    self._retries = 0
    self._timeout_retries = 0
    self._suppress_failure_message = False
    self._quiet_success = quiet_success
    if not self._quiet_success:
      message('START', spec.shortname, do_newline=self._travis)
    self.result = JobResult()
    self.start()

  def GetSpec(self):
    return self._spec

  def start(self):
    self._tempfile = tempfile.TemporaryFile()
    env = dict(os.environ)
    env.update(self._spec.environ)
    env.update(self._add_env)
    env = sanitized_environment(env)
    self._start = time.time()
    cmdline = self._spec.cmdline
    # The Unix time command is finicky when used with MSBuild, so we don't use it
    # with jobs that run MSBuild.
    global measure_cpu_costs
    if measure_cpu_costs and not 'vsprojects\\build' in cmdline[0]:
      cmdline = ['time', '-p'] + cmdline
    else:
      measure_cpu_costs = False
    try_start = lambda: subprocess.Popen(args=cmdline,
                                         stderr=subprocess.STDOUT,
                                         stdout=self._tempfile,
                                         cwd=self._spec.cwd,
                                         shell=self._spec.shell,
                                         env=env)
    delay = 0.3
    for i in range(0, 4):
      try:
        self._process = try_start()
        break
      except OSError:
        message('WARNING', 'Failed to start %s, retrying in %f seconds' % (self._spec.shortname, delay))
        time.sleep(delay)
        delay *= 2
    else:
      self._process = try_start()
    self._state = _RUNNING

  def state(self):
    """Poll current state of the job. Prints messages at completion."""
    def stdout(self=self):
      stdout = read_from_start(self._tempfile)
      self.result.message = stdout[-_MAX_RESULT_SIZE:]
      return stdout
    if self._state == _RUNNING and self._process.poll() is not None:
      elapsed = time.time() - self._start
      self.result.elapsed_time = elapsed
      if self._process.returncode != 0:
        if self._retries < self._spec.flake_retries:
          message('FLAKE', '%s [ret=%d, pid=%d]' % (
            self._spec.shortname, self._process.returncode, self._process.pid),
            stdout(), do_newline=True)
          self._retries += 1
          self.result.num_failures += 1
          self.result.retries = self._timeout_retries + self._retries
          self.start()
        else:
          self._state = _FAILURE
          if not self._suppress_failure_message:
            message('FAILED', '%s [ret=%d, pid=%d]' % (
                self._spec.shortname, self._process.returncode, self._process.pid),
                stdout(), do_newline=True)
          self.result.state = 'FAILED'
          self.result.num_failures += 1
          self.result.returncode = self._process.returncode
      else:
        self._state = _SUCCESS
        measurement = ''
        if measure_cpu_costs:
          m = re.search(r'real\s+([0-9.]+)\nuser\s+([0-9.]+)\nsys\s+([0-9.]+)', stdout())
          real = float(m.group(1))
          user = float(m.group(2))
          sys = float(m.group(3))
          if real > 0.5:
            cores = (user + sys) / real
            self.result.cpu_measured = float('%.01f' % cores)
            self.result.cpu_estimated = float('%.01f' % self._spec.cpu_cost)
            measurement = '; cpu_cost=%.01f; estimated=%.01f' % (self.result.cpu_measured, self.result.cpu_estimated)
        if not self._quiet_success:
          message('PASSED', '%s [time=%.1fsec; retries=%d:%d%s]' % (
              self._spec.shortname, elapsed, self._retries, self._timeout_retries, measurement),
              stdout() if self._spec.verbose_success else None,
              do_newline=self._newline_on_success or self._travis)
        self.result.state = 'PASSED'
    elif (self._state == _RUNNING and
          self._spec.timeout_seconds is not None and
          time.time() - self._start > self._spec.timeout_seconds):
      if self._timeout_retries < self._spec.timeout_retries:
        message('TIMEOUT_FLAKE', '%s [pid=%d]' % (self._spec.shortname, self._process.pid), stdout(), do_newline=True)
        self._timeout_retries += 1
        self.result.num_failures += 1
        self.result.retries = self._timeout_retries + self._retries
        if self._spec.kill_handler:
          self._spec.kill_handler(self)
        self._process.terminate()
        self.start()
      else:
        message('TIMEOUT', '%s [pid=%d]' % (self._spec.shortname, self._process.pid), stdout(), do_newline=True)
        self.kill()
        self.result.state = 'TIMEOUT'
        self.result.num_failures += 1
    return self._state

  def kill(self):
    if self._state == _RUNNING:
      self._state = _KILLED
      if self._spec.kill_handler:
        self._spec.kill_handler(self)
      self._process.terminate()

  def suppress_failure_message(self):
    self._suppress_failure_message = True


class Jobset(object):
  """Manages one run of jobs."""

  def __init__(self, check_cancelled, maxjobs, newline_on_success, travis,
               stop_on_failure, add_env, quiet_success, max_time, clear_alarms):
    self._running = set()
    self._check_cancelled = check_cancelled
    self._clear_alarms = clear_alarms
    self._cancelled = False
    self._failures = 0
    self._completed = 0
    self._maxjobs = maxjobs
    self._newline_on_success = newline_on_success
    self._travis = travis
    self._stop_on_failure = stop_on_failure
    self._add_env = add_env
    self._quiet_success = quiet_success
    self._max_time = max_time
    self.resultset = {}
    self._remaining = None
    self._start_time = time.time()

  def set_remaining(self, remaining):
    self._remaining = remaining

  def get_num_failures(self):
    return self._failures

  def cpu_cost(self):
    c = 0
    for job in self._running:
      c += job._spec.cpu_cost
    return c

  def start(self, spec):
    """Start a job. Return True on success, False on failure."""
    while True:
      if self._max_time > 0 and time.time() - self._start_time > self._max_time:
        skipped_job_result = JobResult()
        skipped_job_result.state = 'SKIPPED'
        message('SKIPPED', spec.shortname, do_newline=True)
        self.resultset[spec.shortname] = [skipped_job_result]
        return True
      if self.cancelled(): return False
      current_cpu_cost = self.cpu_cost()
      if current_cpu_cost == 0: break
      if current_cpu_cost + spec.cpu_cost <= self._maxjobs: break
      self.reap()
    if self.cancelled(): return False
    job = Job(spec,
              self._newline_on_success,
              self._travis,
              self._add_env,
              self._quiet_success)
    self._running.add(job)
    if job.GetSpec().shortname not in self.resultset:
      self.resultset[job.GetSpec().shortname] = []
    return True

  def reap(self):
    """Collect the dead jobs."""
    while self._running:
      dead = set()
      for job in self._running:
        st = eintr_be_gone(lambda: job.state())
        if st == _RUNNING: continue
        if st == _FAILURE or st == _KILLED:
          self._failures += 1
          if self._stop_on_failure:
            self._cancelled = True
            for job in self._running:
              job.kill()
        dead.add(job)
        break
      for job in dead:
        self._completed += 1
        if not self._quiet_success or job.result.state != 'PASSED':
          self.resultset[job.GetSpec().shortname].append(job.result)
        self._running.remove(job)
      if dead: return
      if not self._travis and platform_string() != 'windows':
        rstr = '' if self._remaining is None else '%d queued, ' % self._remaining
        if self._remaining is not None and self._completed > 0:
          now = time.time()
          sofar = now - self._start_time
          remaining = sofar / self._completed * (self._remaining + len(self._running))
          rstr = 'ETA %.1f sec; %s' % (remaining, rstr)
        message('WAITING', '%s%d jobs running, %d complete, %d failed' % (
            rstr, len(self._running), self._completed, self._failures))
      if platform_string() == 'windows':
        time.sleep(0.1)
      else:
        global have_alarm
        if not have_alarm:
          have_alarm = True
          signal.alarm(10)
        signal.pause()

  def cancelled(self):
    """Poll for cancellation."""
    if self._cancelled: return True
    if not self._check_cancelled(): return False
    for job in self._running:
      job.kill()
    self._cancelled = True
    return True

  def finish(self):
    while self._running:
      if self.cancelled(): pass  # poll cancellation
      self.reap()
    # Clear the alarms when finished to avoid a race condition causing job
    # failures. Don't do this when running multi-VM tests because clearing
    # the alarms causes the test to stall
    if platform_string() != 'windows' and self._clear_alarms:
      signal.alarm(0)
    return not self.cancelled() and self._failures == 0


def _never_cancelled():
  return False


def tag_remaining(xs):
  staging = []
  for x in xs:
    staging.append(x)
    if len(staging) > 5000:
      yield (staging.pop(0), None)
  n = len(staging)
  for i, x in enumerate(staging):
    yield (x, n - i - 1)


def run(cmdlines,
        check_cancelled=_never_cancelled,
        maxjobs=None,
        newline_on_success=False,
        travis=False,
        infinite_runs=False,
        stop_on_failure=False,
        add_env={},
        skip_jobs=False,
        quiet_success=False,
        max_time=-1,
        clear_alarms=True):
  if skip_jobs:
    resultset = {}
    skipped_job_result = JobResult()
    skipped_job_result.state = 'SKIPPED'
    for job in cmdlines:
      message('SKIPPED', job.shortname, do_newline=True)
      resultset[job.shortname] = [skipped_job_result]
    return 0, resultset
  js = Jobset(check_cancelled,
              maxjobs if maxjobs is not None else _DEFAULT_MAX_JOBS,
              newline_on_success, travis, stop_on_failure, add_env,
              quiet_success, max_time, clear_alarms)
  for cmdline, remaining in tag_remaining(cmdlines):
    if not js.start(cmdline):
      break
    if remaining is not None:
      js.set_remaining(remaining)
  js.finish()
  return js.get_num_failures(), js.resultset