#!/usr/bin/env python3
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
"""Script for collecting dataflow traces using DFSan compiled binary. The script
imitates `CollectDataFlow` function from libFuzzer but provides some flexibility
for skipping long and/or slow corpus elements.

Follow https://github.com/google/oss-fuzz/issues/1632 for more details."""
import hashlib
import os
import subprocess
import sys

# pylint: skip-file

# See https://github.com/google/oss-fuzz/pull/5024#discussion_r561313003 for why
# we are disabling pylint for this file (we can't do it in .pylintrc, probably
# because of weirdness with this file's package, so we do it here).

# Tunables are read from the environment at import time so that the runner can
# adjust them without rebuilding the OSS-Fuzz base images.

# Corpus files larger than this many bytes are skipped instead of traced.
FILE_SIZE_LIMIT = int(os.environ.get('DFT_FILE_SIZE_LIMIT', 32 * 1024))
# Timeout granted to every input regardless of its size.
MIN_TIMEOUT = float(os.environ.get('DFT_MIN_TIMEOUT', 1.0))
# Extra timeout added on top of MIN_TIMEOUT for an input of FILE_SIZE_LIMIT
# bytes; smaller inputs get a proportional share of it.
TIMEOUT_RANGE = float(os.environ.get('DFT_TIMEOUT_RANGE', 3.0))

# Value exported as the DFSAN_OPTIONS environment variable before the binary
# is executed.
DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0'


def _error(msg):
  """Write `msg`, followed by a newline, to standard error.

  Args:
    msg: The text to report; must be a `str`.
  """
  line = ''.join((msg, '\n'))
  sys.stderr.write(line)


def _list_dir(dirpath):
  """Yield the path of every file found under `dirpath`, recursively.

  Each path is the directory reported by `os.walk` joined with one of the file
  names it contains. Directories themselves are not yielded.

  Args:
    dirpath: Root directory to walk.
  """
  for parent, _subdirs, names in os.walk(dirpath):
    yield from (os.path.join(parent, name) for name in names)


def _sha1(filepath):
  """Return the hexadecimal SHA-1 digest of the contents of `filepath`.

  The whole file is read into memory in binary mode before hashing.

  Args:
    filepath: Path of the file to hash.
  """
  with open(filepath, 'rb') as stream:
    contents = stream.read()
  return hashlib.sha1(contents).hexdigest()


def _run(cmd, timeout=None):
  """Run `cmd` as a subprocess, capturing its stdout and stderr.

  Args:
    cmd: Argument list passed to `subprocess.run`.
    timeout: Optional timeout forwarded to `subprocess.run`.

  Returns:
    The `subprocess.CompletedProcess` for the command, or None when starting
    or running the command raised an exception other than a timeout. A
    non-zero exit code is reported on stderr but the result is still returned.

  Raises:
    subprocess.TimeoutExpired: If the command did not finish within `timeout`.
  """
  try:
    completed = subprocess.run(cmd,
                               timeout=timeout,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
  except subprocess.TimeoutExpired:
    # Timeouts are the caller's business: let them propagate untouched.
    raise
  except Exception as e:
    _error('Exception: ' + str(e))
    return None

  exit_code = completed.returncode
  if exit_code:
    _error('{command} finished with non-zero code: {code}'.format(
        command=str(cmd), code=exit_code))

  return completed


def _timeout(size):
  """Return the timeout to use for an input of `size` bytes.

  The value grows linearly with the file size so that slow units can be
  discarded: it starts at MIN_TIMEOUT for an empty input and reaches
  MIN_TIMEOUT + TIMEOUT_RANGE for an input of FILE_SIZE_LIMIT bytes.

  Args:
    size: Size of the input file.
  """
  return MIN_TIMEOUT + size * TIMEOUT_RANGE / FILE_SIZE_LIMIT


def collect_traces(binary, corpus_dir, dft_dir):
  """Run `binary` on every small enough corpus file and tally the outcomes.

  Files under `corpus_dir` larger than FILE_SIZE_LIMIT are skipped. The
  remaining ones are processed from smallest to largest; each is passed to
  `binary` together with an output path inside `dft_dir` named after the
  SHA-1 of the input's contents, under a timeout derived from that input's
  own size.

  Args:
    binary: Path of the executable to run on each input.
    corpus_dir: Directory that is walked recursively for input files.
    dft_dir: Directory in which the per-input output paths are placed.

  Returns:
    A dict of counters: 'total' (files seen), 'long' (skipped for size),
    'traced' (exit code zero), 'failed' (non-zero exit code or the command
    could not be run) and 'slow' (timed out). Every file counted in 'total' is
    counted in exactly one of the other four.
  """
  stats = {
      'total': 0,
      'traced': 0,
      'long': 0,
      'slow': 0,
      'failed': 0,
  }

  files_and_sizes = {}
  for f in _list_dir(corpus_dir):
    stats['total'] += 1
    size = os.path.getsize(f)
    if size > FILE_SIZE_LIMIT:
      stats['long'] += 1
      print('Skipping large file ({size}b): {path}'.format(size=size, path=f))
      continue
    files_and_sizes[f] = size

  # Iterate over (path, size) pairs so that the timeout is computed from the
  # size of the file actually being traced, not from whatever `size` was left
  # over by the listing loop above.
  for f, size in sorted(files_and_sizes.items(), key=lambda item: item[1]):
    output_path = os.path.join(dft_dir, _sha1(f))
    try:
      result = _run([binary, f, output_path], timeout=_timeout(size))
    except subprocess.TimeoutExpired as e:
      _error('Slow input: ' + str(e))
      stats['slow'] += 1
      continue

    # `_run` returns None when the command could not be executed at all; count
    # that as a failure instead of crashing on `None.returncode`.
    if result is None or result.returncode:
      stats['failed'] += 1
    else:
      stats['traced'] += 1

  return stats


def dump_functions(binary, dft_dir):
  """Run `binary` without arguments and save its stdout to functions.txt.

  The output is written verbatim (as bytes) to `functions.txt` inside
  `dft_dir`; presumably it is the list of functions known to the binary.

  Args:
    binary: Path of the executable to run.
    dft_dir: Directory in which `functions.txt` is created.

  Returns:
    True if the binary ran and exited with code zero, False otherwise (in
    which case nothing is written).
  """
  outcome = _run([binary])
  if outcome and not outcome.returncode:
    functions_path = os.path.join(dft_dir, 'functions.txt')
    with open(functions_path, 'wb') as out:
      out.write(outcome.stdout)
    return True

  return False


def main():
  """Command-line entry point: `<binary> <corpus_dir> <dft_dir>`.

  Exports DFSAN_OPTIONS into the environment, dumps the binary's function list
  into `dft_dir`, collects a trace for every corpus input and prints the
  resulting counters. Exits with status 1 on bad usage or when the function
  dump fails, and with status 0 otherwise.
  """
  argv = sys.argv
  if len(argv) < 4:
    _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(argv[0]))
    sys.exit(1)

  binary, corpus_dir, dft_dir = argv[1:4]

  # Child processes started below inherit this setting.
  os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS

  dumped = dump_functions(binary, dft_dir)
  if not dumped:
    _error('Failed to dump functions. Something is wrong.')
    sys.exit(1)

  stats = collect_traces(binary, corpus_dir, dft_dir)
  for name in stats:
    print('{0}: {1}'.format(name, stats[name]))

  # Sanity check that no input was lost: 'total' counts each input once and
  # exactly one other counter classifies it, so all counters together must add
  # up to twice 'total'.
  assert 2 * stats['total'] == sum(stats.values())
  sys.exit(0)


# Run the collection only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
  main()