# Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Classes for wrapping a model to operate on different data shapes.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import abc from tensorflow.contrib.timeseries.python.timeseries import feature_keys from tensorflow.contrib.timeseries.python.timeseries import math_utils from tensorflow.contrib.timeseries.python.timeseries.model import ModelOutputs from tensorflow.python.estimator import estimator_lib from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import nest class PassthroughStateManager(object): """A minimal wrapper for models which do not need state management.""" def __init__(self): self._input_statistics = None self._graph_initialized = False def initialize_graph(self, model, input_statistics=None): """Adds required operations to the graph.""" del model # unused self._graph_initialized = True self._input_statistics = input_statistics def define_loss(self, model, features, mode): """Wrap "model" with StateManager-specific operations. Args: model: The model (inheriting from TimeSeriesModel) to manage state for. features: A dictionary with the following key/value pairs: feature_keys.TrainEvalFeatures.TIMES: A [batch size x window size] Tensor with times for each observation. feature_keys.TrainEvalFeatures.VALUES: A [batch size x window size x num features] Tensor with values for each observation. mode: The tf.estimator.ModeKeys mode to use (TRAIN or EVAL). Returns: A ModelOutputs object. Raises: ValueError: If start state was specified. """ if feature_keys.State.STATE_TUPLE in features: raise ValueError( "Overriding start state is not supported for this model.") return model.define_loss(features, mode) class _OverridableStateManager(PassthroughStateManager): """Base class for state managers which support overriding model state.""" @abc.abstractmethod def _define_loss_with_saved_state(self, model, features, mode): pass def define_loss(self, model, features, mode): """Switches between explicit start state and managed state.""" if feature_keys.FilteringFeatures.STATE_TUPLE in features: # Explicit start state has been provided, so we should use that. if mode == estimator_lib.ModeKeys.TRAIN: raise ValueError( "Overriding saved state for training is not supported (but a value " "for feature {} was specified).".format( feature_keys.FilteringFeatures.STATE_TUPLE)) start_state = features[feature_keys.FilteringFeatures.STATE_TUPLE] del features[feature_keys.FilteringFeatures.STATE_TUPLE] return model.get_batch_loss( features=features, mode=mode, state=start_state) else: # No explicit start state; use managed state. return self._define_loss_with_saved_state( model=model, features=features, mode=mode) class FilteringOnlyStateManager(_OverridableStateManager): """State manager for models which use state only for filtering. Window-based models (ARModel) do not require state to be fed during training (instead requiring a specific window size). Rather than requiring a minimum window size for filtering, these models maintain this window in their state, and so need state to be fed. """ def _define_loss_with_saved_state(self, model, features, mode): return model.define_loss(features, mode) class ChainingStateManager(_OverridableStateManager): """Maintains state across a batch for SequentialTimeSeriesModel subclasses. The batch dimension is treated as indexing sequential chunks of the same timeseries. End state from each chunk is fed as start state to the next chunk during the next timestep. This is an approximation to full-batch training for sequential models, but is typically much faster while still accurately recovering parameters. The speedup comes from reduced scheduling overhead of TensorFlow ops, since each operation can do much more work. """ def __init__(self, state_saving_interval=20, checkpoint_state=False): """Initialize the state manager. Args: state_saving_interval: This state manager saves intermediate model state every `state_saving_interval` times. Larger values save memory, and checkpoint size if `checkpoint_state` is enabled, but models will need to impute across artificial gaps of up to this size (i.e. gaps not appearing in the original data). This imputation may affect training. Set state_saving_interval to 1 to avoid any artificial imputation. checkpoint_state: If True, saved intermediate model state will be written to checkpoints. Checkpoints will then scale with dataset size. If False, state will be freshly imputed from the beginning of a series each time the model is restored, which means it may take a few iterations for state to warm up. """ super(ChainingStateManager, self).__init__() self._checkpoint_state = checkpoint_state self._state_saving_interval = state_saving_interval self._start_state = None self._cached_states = None def initialize_graph(self, model, input_statistics=None): """Adds required operations to the graph.""" super(ChainingStateManager, self).initialize_graph( model=model, input_statistics=input_statistics) self._start_state = model.get_start_state() self._cached_states = math_utils.TupleOfTensorsLookup( key_dtype=dtypes.int64, default_values=self._start_state, empty_key=-1, name="cached_states", checkpoint=self._checkpoint_state) def _define_loss_with_saved_state(self, model, features, mode): """Feeds end state from one training iteration into the next. Args: model: The model to wrap. Compatible with children of TimeSeriesModel. features: Dictionary with Tensor values defining the data to be processed. The expected key/value pairs are at minimum: feature_keys.TrainEvalFeatures.TIMES: A [number of chunks x window size] Tensor with times for each observation, the result of chunking a single longer time series. feature_keys.TrainEvalFeatures.VALUES: A [number of chunks x window size x num features] Tensor with values for each observation, corresponding to times. mode: The tf.estimator.ModeKeys mode to use. For EVAL and INFER, no batching is performed, which may be slow. This is to avoid giving cached and almost certainly stale values. Returns: A ModelOutputs object. Raises: ValueError: If initialize_graph has not been called. """ if not self._graph_initialized: raise ValueError("ChainingStateManager requires initialize_graph() to be " "called before use.") (loss_op, end_state, batch_predictions) = self._update_cached_states( model=model, features=features, mode=mode) # Add a batch dimension so state can be used directly (e.g. for predictions) # without the user manually reshaping it. last_end_state_flat = [end_state_value[-1][None] for end_state_value in nest.flatten(end_state)] batch_predictions["observed"] = features[ feature_keys.TrainEvalFeatures.VALUES] return ModelOutputs( loss=loss_op, end_state=nest.pack_sequence_as(end_state, last_end_state_flat), predictions=batch_predictions, prediction_times=features[feature_keys.TrainEvalFeatures.TIMES]) def _get_chunk_number(self, time): return time // self._state_saving_interval def _get_cached_states(self, times): """Retrieve cached states for a batch of times.""" read_chunk_numbers = self._get_chunk_number(times) looked_up_state = list(self._cached_states.lookup( math_ops.cast(read_chunk_numbers, dtypes.int64))) looked_up_state = tuple(looked_up_state) # We need to special-case the first chunk in a series to explicitly rely on # the model's starting state so that gradients flow back to it. Otherwise it # would affect only initialization, and would not be read from or updated # during training. Not doing this also isolates that part of the graph, # leading to errors on model reload if there are trainable variables # affecting a model's start state. if self._input_statistics is not None: start_time = self._input_statistics.start_time else: start_time = 0 set_to_start_state = math_ops.equal(read_chunk_numbers, self._get_chunk_number(start_time)) new_states = [] for start_state_value, cache_variable in zip( nest.flatten( math_utils.replicate_state(self._start_state, array_ops.shape(times)[0])), nest.flatten(looked_up_state)): new_states.append( array_ops.where(set_to_start_state, start_state_value, cache_variable)) looked_up_state = nest.pack_sequence_as(looked_up_state, new_states) return looked_up_state def _update_cached_states(self, model, features, mode): """Read, process, and write chunks to the cache.""" times = features[feature_keys.TrainEvalFeatures.TIMES] looked_up_state = self._get_cached_states(times[:, 0]) (model_loss, intermediate_states, batch_predictions) = model.per_step_batch_loss( features=features, mode=mode, state=looked_up_state) # We need to at least write to the bucket after the one we read from. min_chunk_numbers = self._get_chunk_number(times) + 1 # We write to the bucket that would have been read had the window started at # the next sample (except for the last sample in the window, which gets # written to the next bucket). This assumes fixed missing times (i.e. if we # were presented with times [10, 50] we will never see times [30, 50]). # # TODO(allenl): Retrieve the highest time less than the current time rather # than relying on fixed bucketing. write_chunk_numbers = math_ops.maximum( self._get_chunk_number(array_ops.concat( [times[:, 1:], times[:, -1:] + 1], axis=1)), min_chunk_numbers) # Write once for every computed state; this may mean that we write multiple # times to the same cell, but later writes will take precedence. save_ops = [ self._cached_states.insert( keys=write_chunk_numbers, values=intermediate_states)] end_state = nest.pack_sequence_as( intermediate_states, [state_element[:, -1] for state_element in nest.flatten(intermediate_states)]) with ops.control_dependencies(save_ops): # Make sure end states get saved at each iteration loss_op = array_ops.identity(model_loss) return loss_op, end_state, batch_predictions