#!/usr/bin/python

"""
Copyright 2013 Google Inc.

Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file.

Calculate differences between image pairs, and store them in a database.
"""

# System-level imports
import contextlib
import errno
import json
import logging
import os
import Queue
import re
import shutil
import tempfile
import threading
import time
import urllib

# Must fix up PYTHONPATH before importing from within Skia
import rs_fixpypath  # pylint: disable=W0611

# Imports from within Skia
import find_run_binary
from py.utils import gs_utils

SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')

DEFAULT_IMAGE_SUFFIX = '.png'
DEFAULT_IMAGES_SUBDIR = 'images'
# TODO(epoger): Figure out a better default number of threads; for now,
# using a conservative default value.
DEFAULT_NUM_WORKER_THREADS = 1

DISALLOWED_FILEPATH_CHAR_REGEX = re.compile(r'[^\w\-]')

RGBDIFFS_SUBDIR = 'diffs'
WHITEDIFFS_SUBDIR = 'whitediffs'

# Keys used within DiffRecord dictionary representations.
# NOTE: Keep these in sync with static/constants.js
KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'

# Special values within ImageDiffDB._diff_dict
_DIFFRECORD_FAILED = 'failed'
_DIFFRECORD_PENDING = 'pending'

# How often to report tasks_queue size
QUEUE_LOGGING_GRANULARITY = 1000

# Temporary variable to keep track of how many times we download
# the same file in multiple threads.
# TODO(epoger): Delete this, once we see that the number stays close to 0.
global_file_collisions = 0
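
# For orientation, a DiffRecord.as_dict() result built from the keys above
# looks roughly like the sketch below.  The values are illustrative only,
# not real output:
#
#   {
#     'numDifferingPixels': 123,
#     'percentDifferingPixels': 0.04,
#     'maxDiffPerChannel': [255, 12, 0],
#     'perceptualDifference': 0.2,
#     'diffUrl': 'aaa111-vs-bbb222.png',
#     'whiteDiffUrl': 'aaa111-vs-bbb222.png',
#   }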

class DiffRecord(object):
  """ Record of differences between two images. """

  def __init__(self, gs, storage_root,
               expected_image_url, expected_image_locator,
               actual_image_url, actual_image_locator,
               expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
               actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
               image_suffix=DEFAULT_IMAGE_SUFFIX):
    """Download this pair of images (unless we already have them on local
    disk), and prepare a DiffRecord for them.

    Args:
      gs: instance of GSUtils object we can use to download images
      storage_root: root directory on local disk within which we store all
          images
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
      expected_images_subdir: the subdirectory expected images are stored in.
      actual_images_subdir: the subdirectory actual images are stored in.
      image_suffix: the suffix of images.
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)

    # Download the expected/actual images, if we don't have them already.
    expected_image_file = os.path.join(
        storage_root, expected_images_subdir,
        str(expected_image_locator) + image_suffix)
    actual_image_file = os.path.join(
        storage_root, actual_images_subdir,
        str(actual_image_locator) + image_suffix)
    for image_file, image_url in [
        (expected_image_file, expected_image_url),
        (actual_image_file, actual_image_url)]:
      if image_file and image_url:
        try:
          _download_file(gs, image_file, image_url)
        except Exception:
          logging.exception('unable to download image_url %s to file %s' %
                            (image_url, image_file))
          raise

    # Return early if we do not need to generate diffs.
    if (expected_image_url == actual_image_url or
        not expected_image_url or not actual_image_url):
      return

    # Get all diff images and values using the skpdiff binary.
    skpdiff_output_dir = tempfile.mkdtemp()
    try:
      skpdiff_summary_file = os.path.join(skpdiff_output_dir,
                                          'skpdiff-output.json')
      skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
      skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
      _mkdir_unless_exists(skpdiff_rgbdiff_dir)
      _mkdir_unless_exists(skpdiff_whitediff_dir)

      # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
      # instead of calling it separately for each image pair.
      # Pro: we'll incur less overhead from making repeated system calls,
      # spinning up the skpdiff binary, etc.
      # Con: we would have to wait until all image pairs were loaded before
      # generating any of the diffs?
      # Note(stephana): '--longnames' was added to allow for this
      # case (multiple files at once) versus specifying output diffs
      # directly.
      find_run_binary.run_command(
          [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
           '--jsonp', 'false',
           '--longnames', 'true',
           '--output', skpdiff_summary_file,
           '--differs', 'perceptual', 'different_pixels',
           '--rgbDiffDir', skpdiff_rgbdiff_dir,
           '--whiteDiffDir', skpdiff_whitediff_dir,
           ])

      # Get information out of the skpdiff_summary_file.
      with contextlib.closing(open(skpdiff_summary_file)) as fp:
        data = json.load(fp)

      # For now, we can assume there is only one record in the output summary,
      # since we passed skpdiff only one pair of images.
      record = data['records'][0]
      self._width = record['width']
      self._height = record['height']
      self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
      self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]

      # TODO: make max_diff_per_channel a tuple instead of a list, because the
      # structure is meaningful (first element is red, second is green, etc.)
      # See http://stackoverflow.com/a/626871
      self._max_diff_per_channel = [
          record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]

      per_differ_stats = record['diffs']
      for stats in per_differ_stats:
        differ_name = stats['differName']
        if differ_name == 'different_pixels':
          self._num_pixels_differing = stats['pointsOfInterest']
        elif differ_name == 'perceptual':
          perceptual_similarity = stats['result']
          # skpdiff returns the perceptual similarity; convert it to get the
          # perceptual difference percentage.
          # skpdiff outputs -1 if the images are different sizes. Treat any
          # output that does not lie in [0, 1] as having 0% perceptual
          # similarity.
          if not 0 <= perceptual_similarity <= 1:
            perceptual_similarity = 0
          self._perceptual_difference = 100 - (perceptual_similarity * 100)
    finally:
      shutil.rmtree(skpdiff_output_dir)
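
  # For reference, the single skpdiff summary record consumed above has
  # roughly this shape (field names taken from the parsing code; values are
  # illustrative, not real skpdiff output):
  #
  #   {"records": [{
  #      "width": 640, "height": 480,
  #      "rgbDiffPath": ".../diffs/aaa111-vs-bbb222.png",
  #      "whiteDiffPath": ".../whitediffs/aaa111-vs-bbb222.png",
  #      "maxRedDiff": 255, "maxGreenDiff": 12, "maxBlueDiff": 0,
  #      "diffs": [
  #        {"differName": "different_pixels", "pointsOfInterest": 123},
  #        {"differName": "perceptual", "result": 0.998}
  #      ]}]}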

  # TODO(epoger): Use properties instead of getters throughout.
  # See http://stackoverflow.com/a/6618176
  def get_num_pixels_differing(self):
    """Returns the absolute number of pixels that differ."""
    return self._num_pixels_differing

  def get_percent_pixels_differing(self):
    """Returns the percentage of pixels that differ, as a float between
    0 and 100 (inclusive)."""
    return ((float(self._num_pixels_differing) * 100) /
            (self._width * self._height))

  def get_perceptual_difference(self):
    """Returns the perceptual difference percentage."""
    return self._perceptual_difference

  def get_max_diff_per_channel(self):
    """Returns the maximum difference between the expected and actual images
    for each R/G/B channel, as a list."""
    return self._max_diff_per_channel

  def as_dict(self):
    """Returns a dictionary representation of this DiffRecord, as needed when
    constructing the JSON representation."""
    return {
        KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
        KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
            self.get_percent_pixels_differing(),
        KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
        KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
        KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
        KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl,
    }


class ImageDiffDB(object):
  """ Calculates differences between image pairs, maintaining a database of
  them for download."""

  def __init__(self, storage_root, gs=None,
               num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    """
    Args:
      storage_root: string; root path within which the DB will store all of
          its stuff
      gs: instance of GSUtils object we can use to download images
      num_worker_threads: how many threads should download images and
          generate diffs simultaneously
    """
    self._storage_root = storage_root
    self._gs = gs

    # Mechanism for reporting queue size periodically.
    self._last_queue_size_reported = None
    self._queue_size_report_lock = threading.RLock()

    # Dictionary of DiffRecords, keyed by (expected_image_locator,
    # actual_image_locator) tuples.
    # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED.
    #
    # Any thread that modifies _diff_dict must first acquire
    # _diff_dict_writelock!
    #
    # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    # remove items from self._diff_dict if they haven't been accessed for a
    # long time.  We can always regenerate them by diffing the images we
    # previously downloaded to local disk.
    # I guess we should figure out how expensive it is to download vs diff the
    # image pairs... if diffing them is expensive too, we can write these
    # _diff_dict objects out to disk if there's too many to hold in RAM.
    # Or we could use virtual memory to handle that automatically.
    self._diff_dict = {}
    self._diff_dict_writelock = threading.RLock()

    # Set up the queue for asynchronously loading DiffRecords, and start the
    # worker threads reading from it.
    # The queue maxsize must be 0 (infinite size queue), so that asynchronous
    # calls can return as soon as possible.
    self._tasks_queue = Queue.Queue(maxsize=0)
    self._workers = []
    for i in range(num_worker_threads):
      worker = threading.Thread(target=self.worker, args=(i,))
      worker.daemon = True
      worker.start()
      self._workers.append(worker)
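
  # Lifecycle of a _diff_dict entry, as driven by add_image_pair() and
  # worker() below:
  #   absent               -> _DIFFRECORD_PENDING  (add_image_pair enqueues)
  #   _DIFFRECORD_PENDING  -> DiffRecord instance  (worker succeeded)
  #   _DIFFRECORD_PENDING  -> _DIFFRECORD_FAILED   (worker hit an exception)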

  def log_queue_size_if_changed(self, limit_verbosity=True):
    """Log the size of self._tasks_queue, if it has changed since the last
    call.

    Reports the current queue size, using log.info(), unless the queue is the
    same size as the last time we reported it.

    Args:
      limit_verbosity: if True, only log if the queue size is a multiple of
          QUEUE_LOGGING_GRANULARITY
    """
    # Acquire the lock, to synchronize access to self._last_queue_size_reported
    self._queue_size_report_lock.acquire()
    try:
      size = self._tasks_queue.qsize()
      if size == self._last_queue_size_reported:
        return
      if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0):
        return
      logging.info('tasks_queue size is %d' % size)
      self._last_queue_size_reported = size
    finally:
      self._queue_size_report_lock.release()

  def worker(self, worker_num):
    """Launch a worker thread that pulls tasks off self._tasks_queue.

    Args:
      worker_num: (integer) which worker this is
    """
    while True:
      self.log_queue_size_if_changed()
      params = self._tasks_queue.get()
      key, expected_image_url, actual_image_url = params
      try:
        diff_record = DiffRecord(
            self._gs, self._storage_root,
            expected_image_url=expected_image_url,
            expected_image_locator=key[0],
            actual_image_url=actual_image_url,
            actual_image_locator=key[1])
      except Exception:
        logging.exception(
            'exception while creating DiffRecord for key %s' % str(key))
        diff_record = _DIFFRECORD_FAILED
      self._diff_dict_writelock.acquire()
      try:
        self._diff_dict[key] = diff_record
      finally:
        self._diff_dict_writelock.release()

  @property
  def storage_root(self):
    return self._storage_root

  def add_image_pair(self,
                     expected_image_url, expected_image_locator,
                     actual_image_url, actual_image_locator):
    """Asynchronously prepare a DiffRecord for a pair of images.

    This method will return quickly; calls to get_diff_record() will block
    until the DiffRecord is available (or we have given up on creating it).

    If we already have a DiffRecord for this particular image pair, no work
    will be done.

    If expected_image_url (or its locator) is None, just download actual_image.
    If actual_image_url (or its locator) is None, just download expected_image.

    Args:
      expected_image_url: file, GS, or HTTP url from which we will download the
          expected image
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_url: file, GS, or HTTP url from which we will download the
          actual image
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)
    """
    expected_image_locator = _sanitize_locator(expected_image_locator)
    actual_image_locator = _sanitize_locator(actual_image_locator)
    key = (expected_image_locator, actual_image_locator)
    must_add_to_queue = False

    self._diff_dict_writelock.acquire()
    try:
      if key not in self._diff_dict:
        # If we have already requested a diff between these two images,
        # we don't need to request it again.
        must_add_to_queue = True
        self._diff_dict[key] = _DIFFRECORD_PENDING
    finally:
      self._diff_dict_writelock.release()

    if must_add_to_queue:
      self._tasks_queue.put((key, expected_image_url, actual_image_url))
      self.log_queue_size_if_changed()
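
  # Sketch of the dedupe behavior above (the URLs and locators here are
  # hypothetical placeholders):
  #   db.add_image_pair(url_a, 'aaa111', url_b, 'bbb222')  # queues one task
  #   db.add_image_pair(url_a, 'aaa111', url_b, 'bbb222')  # no-op: key is
  #                                                        # already present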

  def get_diff_record(self, expected_image_locator, actual_image_locator):
    """Returns the DiffRecord for this image pair.

    This call will block until the diff record is available, or we were unable
    to generate it.

    Args:
      expected_image_locator: a unique ID string under which we will store the
          expected image within storage_root (probably including a checksum to
          guarantee uniqueness)
      actual_image_locator: a unique ID string under which we will store the
          actual image within storage_root (probably including a checksum to
          guarantee uniqueness)

    Returns the DiffRecord for this image pair, or None if we were unable to
    generate one.
    """
    key = (_sanitize_locator(expected_image_locator),
           _sanitize_locator(actual_image_locator))
    diff_record = self._diff_dict[key]

    # If we have no results yet, block until we do.
    while diff_record == _DIFFRECORD_PENDING:
      time.sleep(1)
      diff_record = self._diff_dict[key]

    # Once we have the result...
    if diff_record == _DIFFRECORD_FAILED:
      logging.error(
          'failed to create a DiffRecord for expected_image_locator=%s , '
          'actual_image_locator=%s' % (
              expected_image_locator, actual_image_locator))
      return None
    else:
      return diff_record


# Utility functions

def _download_file(gs, local_filepath, url):
  """Download a file from url to local_filepath, unless it is already there.

  Args:
    gs: instance of GSUtils object, in case the url points at Google Storage
    local_filepath: path on local disk where the image should be stored
    url: HTTP or GS URL from which we can download the image if we don't have
        it yet
  """
  global global_file_collisions
  if not os.path.exists(local_filepath):
    _mkdir_unless_exists(os.path.dirname(local_filepath))

    # First download the file contents into a unique filename, and
    # then rename that file.  That way, if multiple threads are downloading
    # the same filename at the same time, they won't interfere with each
    # other (they will both download the file, and one will "win" in the end)
    temp_filename = '%s-%d' % (local_filepath,
                               threading.current_thread().ident)
    if gs_utils.GSUtils.is_gs_url(url):
      (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
      gs.download_file(source_bucket=bucket, source_path=path,
                       dest_path=temp_filename)
    else:
      with contextlib.closing(urllib.urlopen(url)) as url_handle:
        with open(temp_filename, 'wb') as file_handle:
          shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)

    # Rename the file to its real filename.
    # Keep count of how many colliding downloads we encounter;
    # if it's a large number, we may want to change our download strategy
    # to minimize repeated downloads.
    if os.path.exists(local_filepath):
      global_file_collisions += 1
    else:
      os.rename(temp_filename, local_filepath)


def _mkdir_unless_exists(path):
  """Unless path refers to an already-existing directory, create it.

  Args:
    path: path on local disk
  """
  try:
    os.makedirs(path)
  except OSError as e:
    # Swallow only the "directory already exists" error; re-raise anything
    # else (e.g. permission denied) so callers see real failures.
    if e.errno != errno.EEXIST:
      raise


def _sanitize_locator(locator):
  """Returns a sanitized version of a locator (one in which we know none of the
  characters will have special meaning in filenames).

  Args:
    locator: string, or something that can be represented as a string.
        If None or '', it is returned without modification, because empty
        locators have a particular meaning ("there is no image for this")
  """
  if locator:
    return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
  else:
    return locator
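

# Minimal usage sketch, not part of the production module: the URLs, locators,
# and storage root below are hypothetical placeholders.  If the downloads or
# the skpdiff run fail, the worker records the failure and get_diff_record()
# returns None.
if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  demo_storage_root = tempfile.mkdtemp()
  db = ImageDiffDB(storage_root=demo_storage_root)
  db.add_image_pair(
      expected_image_url='http://example.com/expected.png',  # hypothetical
      expected_image_locator='expected-checksum-1234',       # hypothetical
      actual_image_url='http://example.com/actual.png',      # hypothetical
      actual_image_locator='actual-checksum-5678')           # hypothetical
  record = db.get_diff_record(
      expected_image_locator='expected-checksum-1234',
      actual_image_locator='actual-checksum-5678')
  if record:
    print json.dumps(record.as_dict(), indent=2)
  else:
    print 'no DiffRecord available for this image pair'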