#!/usr/bin/python

"""
Copyright 2014 Google Inc.

Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file.

Download actual GM results for a particular builder.
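
Example usage (a sketch; the builder name below is hypothetical; run with
--list-builders to see real ones):

  python gm/rebaseline_server/download_actuals.py \
      --builder Test-SomeOS-SomeModel-SomeGPU-x86-Release \
      --dest-dir /tmp/download_actuals_output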
"""

# System-level imports
import contextlib
import optparse
import os
import posixpath
import re
import shutil
import sys
import urllib
import urllib2
import urlparse

# Imports from within Skia
#
# We need to add the 'gm' and 'tools' directories, so that we can import
# gm_json.py and buildbot_globals.py.
#
# Make sure that these dirs are in the PYTHONPATH, but add them at the *end*
# so any dirs that are already in the PYTHONPATH will be preferred.
#
# TODO(epoger): Is it OK for this to depend on the 'tools' dir, given that
# the tools dir is dependent on the 'gm' dir (to import gm_json.py)?
TRUNK_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
GM_DIRECTORY = os.path.join(TRUNK_DIRECTORY, 'gm')
TOOLS_DIRECTORY = os.path.join(TRUNK_DIRECTORY, 'tools')
if GM_DIRECTORY not in sys.path:
  sys.path.append(GM_DIRECTORY)
if TOOLS_DIRECTORY not in sys.path:
  sys.path.append(TOOLS_DIRECTORY)
import buildbot_globals
import gm_json

# Imports from third-party code
APICLIENT_DIRECTORY = os.path.join(
    TRUNK_DIRECTORY, 'third_party', 'externals', 'google-api-python-client')
if APICLIENT_DIRECTORY not in sys.path:
  sys.path.append(APICLIENT_DIRECTORY)
from googleapiclient.discovery import build as build_service


GM_SUMMARIES_BUCKET = buildbot_globals.Get('gm_summaries_bucket')
DEFAULT_ACTUALS_BASE_URL = (
    'http://storage.googleapis.com/%s' % GM_SUMMARIES_BUCKET)
DEFAULT_JSON_FILENAME = 'actual-results.json'


class Download(object):

  def __init__(self, actuals_base_url=DEFAULT_ACTUALS_BASE_URL,
               json_filename=DEFAULT_JSON_FILENAME,
               gm_actuals_root_url=gm_json.GM_ACTUALS_ROOT_HTTP_URL):
    """
    Args:
      actuals_base_url: URL pointing at the root directory
          containing all actual-results.json files, e.g.,
          http://domain.name/path/to/dir  OR
          file:///absolute/path/to/localdir
      json_filename: The JSON filename to read from within each directory.
      gm_actuals_root_url: Base URL under which the actually-generated-by-bots
          GM images are stored.
    """
    self._actuals_base_url = actuals_base_url
    self._json_filename = json_filename
    self._gm_actuals_root_url = gm_actuals_root_url
    self._image_filename_re = re.compile(gm_json.IMAGE_FILENAME_PATTERN)

  def fetch(self, builder_name, dest_dir):
    """ Downloads actual GM results for a particular builder.

    Args:
      builder_name: which builder to download results for
      dest_dir: path to directory where the image files will be written;
                if the directory does not exist yet, it will be created
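
    Minimal usage sketch (builder name and dest_dir are hypothetical):
      downloader = Download()
      downloader.fetch(
          builder_name='Test-SomeOS-SomeModel-SomeGPU-x86-Release',
          dest_dir='/tmp/actuals')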

    TODO(epoger): Display progress info.  Right now, it can take a long time
    to download all of the results, and there is no indication of progress.

    TODO(epoger): Download multiple images in parallel to speed things up.
    """
    json_url = posixpath.join(self._actuals_base_url, builder_name,
                              self._json_filename)
    json_contents = urllib2.urlopen(json_url).read()
    results_dict = gm_json.LoadFromString(json_contents)

    actual_results_dict = results_dict[gm_json.JSONKEY_ACTUALRESULTS]
    for result_type in sorted(actual_results_dict.keys()):
      results_of_this_type = actual_results_dict[result_type]
      if not results_of_this_type:
        continue
      for image_name in sorted(results_of_this_type.keys()):
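        # Split the image filename into (test, config) according to
        # gm_json.IMAGE_FILENAME_PATTERN (defined in gm_json.py).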
        (test, config) = self._image_filename_re.match(image_name).groups()
        (hash_type, hash_digest) = results_of_this_type[image_name]
        source_url = gm_json.CreateGmActualUrl(
            test_name=test, hash_type=hash_type, hash_digest=hash_digest,
            gm_actuals_root_url=self._gm_actuals_root_url)
        dest_path = os.path.join(dest_dir, config, test + '.png')
        # TODO(epoger): To speed this up, we should only download files that
        # we don't already have on local disk.
        copy_contents(source_url=source_url, dest_path=dest_path,
                      create_subdirs_if_needed=True)


def create_filepath_url(filepath):
  """ Returns a file:/// URL pointing at the given filepath on local disk.

  For now, this is only used by unittests, but I anticipate it being useful
  in production, as a way for developers to run rebaseline_server over locally
  generated images.

  TODO(epoger): Move this function, and copy_contents(), into a shared
  utility module.  They are generally useful.

  Args:
    filepath: string; path to a file on local disk (may be absolute or relative,
        and the file does not need to exist)

  Returns:
    A file:/// URL pointing at the file.  Regardless of whether filepath was
        specified as a relative or absolute path, the URL will contain an
        absolute path to the file.

  Raises:
    An Exception, if filepath is already a URL.
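
  Example (a sketch; the absolute path in the result depends on the current
  working directory):
    create_filepath_url('gm/output')
    # returns something like 'file:///absolute/path/to/cwd/gm/output'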
  """
  if urlparse.urlparse(filepath).scheme:
    raise Exception('"%s" is already a URL' % filepath)
  return urlparse.urljoin(
      'file:', urllib.pathname2url(os.path.abspath(filepath)))


def copy_contents(source_url, dest_path, create_subdirs_if_needed=False):
  """ Copies the full contents of the URL 'source_url' into
  filepath 'dest_path'.

  Args:
    source_url: string; complete URL to read from
    dest_path: string; complete filepath to write to (may be absolute or
        relative)
    create_subdirs_if_needed: boolean; whether to create subdirectories as
        needed to create dest_path

  Raises:
    Some subclass of Exception if unable to read source_url or write dest_path.
  """
  if create_subdirs_if_needed:
    dest_dir = os.path.dirname(dest_path)
    if not os.path.exists(dest_dir):
      os.makedirs(dest_dir)
  with contextlib.closing(urllib.urlopen(source_url)) as source_handle:
    with open(dest_path, 'wb') as dest_handle:
      shutil.copyfileobj(fsrc=source_handle, fdst=dest_handle)


def gcs_list_bucket_contents(bucket, subdir=None):
  """ Returns files in the Google Cloud Storage bucket as a (dirs, files) tuple.

  Uses the API documented at
  https://developers.google.com/storage/docs/json_api/v1/objects/list

  Args:
    bucket: name of the Google Storage bucket
    subdir: directory within the bucket to list, or None for root directory
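
  Returns:
    A (dirs, files) tuple, where dirs is a list of the immediate subdirectory
    names under subdir (without trailing slashes) and files is a list of the
    file basenames under subdir.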
  """
  # The GCS command relies on the subdir name (if any) ending with a slash.
  if subdir and not subdir.endswith('/'):
    subdir += '/'
  subdir_length = len(subdir) if subdir else 0

  storage = build_service('storage', 'v1')
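  # Listing with delimiter='/' makes GCS group object names below subdir into
  # 'prefixes' (the "directories") and 'items' (the files directly in subdir).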
  command = storage.objects().list(
      bucket=bucket, delimiter='/', fields='items(name),prefixes',
      prefix=subdir)
  results = command.execute()

  # The GCS command returned two subdicts:
  # prefixes: the full path of every directory within subdir, with trailing '/'
  # items: property dict for each file object within subdir
  #        (including 'name', which is full path of the object)
  dirs = []
  for dir_fullpath in results.get('prefixes', []):
    dir_basename = dir_fullpath[subdir_length:]
    dirs.append(dir_basename[:-1])  # strip trailing slash
  files = []
  for file_properties in results.get('items', []):
    file_fullpath = file_properties['name']
    file_basename = file_fullpath[subdir_length:]
    files.append(file_basename)
  return (dirs, files)


def main():
  parser = optparse.OptionParser()
  required_params = []
  parser.add_option('--actuals-base-url',
                    action='store', type='string',
                    default=DEFAULT_ACTUALS_BASE_URL,
                    help=('Base URL from which to read files containing JSON '
                          'summaries of actual GM results; defaults to '
                          '"%default".'))
  required_params.append('builder')
  # TODO(epoger): Before https://codereview.chromium.org/309653005 , when this
  # tool downloaded the JSON summaries from skia-autogen, it had the ability
  # to get results as of a specific revision number.  We should add similar
  # functionality when retrieving the summaries from Google Storage.
  parser.add_option('--builder',
                    action='store', type='string',
                    help=('REQUIRED: Which builder to download results for. '
                          'To see a list of builders, run with the '
                          '--list-builders option set.'))
  required_params.append('dest_dir')
  parser.add_option('--dest-dir',
                    action='store', type='string',
                    help=('REQUIRED: Directory where all images should be '
                          'written. If this directory does not exist yet, it '
                          'will be created.'))
  parser.add_option('--json-filename',
                    action='store', type='string',
                    default=DEFAULT_JSON_FILENAME,
                    help=('JSON summary filename to read for each builder; '
                          'defaults to "%default".'))
  parser.add_option('--list-builders', action='store_true',
                    help=('List all available builders.'))
  (params, remaining_args) = parser.parse_args()

  if params.list_builders:
    dirs, _ = gcs_list_bucket_contents(bucket=GM_SUMMARIES_BUCKET)
    print '\n'.join(dirs)
    return

  # Make sure all required options were set,
  # and that there were no items left over in the command line.
  for required_param in required_params:
    if not getattr(params, required_param):
      raise Exception('required option \'%s\' was not set' % required_param)
  if remaining_args:
    raise Exception('extra items specified in the command line: %s' %
                    remaining_args)

  downloader = Download(actuals_base_url=params.actuals_base_url)
  downloader.fetch(builder_name=params.builder,
                   dest_dir=params.dest_dir)



if __name__ == '__main__':
  main()