diff options
Diffstat (limited to 'tensorflow/compiler/xla/service/cpu/ir_emitter.h')
-rw-r--r-- | tensorflow/compiler/xla/service/cpu/ir_emitter.h | 402 |
1 file changed, 402 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h new file mode 100644 index 0000000000..06415c735d --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -0,0 +1,402 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMITTER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMITTER_H_ + +#include <stddef.h> +#include <map> +#include <string> +#include <unordered_map> +#include <vector> + +#include "external/llvm/include/llvm/ADT/Triple.h" +#include "external/llvm/include/llvm/IR/Function.h" +#include "external/llvm/include/llvm/IR/IRBuilder.h" +#include "external/llvm/include/llvm/IR/Module.h" +#include "external/llvm/include/llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" +#include "tensorflow/compiler/xla/service/name_uniquer.h" 
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace xla {
namespace cpu {

// This class is the top-level API for the XLA HLO --> LLVM IR compiler. It
// implements the DfsHloVisitor interface and emits HLO computations as LLVM IR
// functions.
class IrEmitter : public DfsHloVisitorWithDefault {
 public:
  // Create a new LLVM IR emitter.
  //
  // hlo_module: the HLO module we are emitting IR for.
  // module_config: the configuration of the HLO module (retained for the
  //   lifetime of the emitter as hlo_module_config_).
  // assignment: a BufferAssignment from which we know which temporary buffers
  //   are used by the HLO nodes.
  // llvm_module: the LLVM module to emit IR into.
  // hlo_to_profile_idx: the mapping from HLO to its index in the profiling
  //   array. May be null, in which case no profiling counters are emitted.
  IrEmitter(const HloModule& hlo_module, const HloModuleConfig& module_config,
            const BufferAssignment& assignment, llvm::Module* llvm_module,
            const std::unordered_map<const HloInstruction*, size_t>*
                hlo_to_profile_idx);
  ~IrEmitter() override;

  // Emit and return the given HLO computation as an LLVM IR
  // function. function_name_prefix is the desired name of the function. If the
  // name is not unique among already emitted functions then a suffix is
  // appended to make the name unique. is_entry_computation indicates that this
  // is the entry computation of the HLO module. If 'instruction_order' is given
  // then the HLO instructions are emitted in the given order. In this case,
  // 'instruction_order' must be a topological sort of the set of nodes
  // accessible from the root of the computation.
  StatusOr<llvm::Function*> EmitComputation(
      HloComputation* computation, const string& function_name_prefix,
      bool is_entry_computation,
      std::vector<const HloInstruction*>* instruction_order = nullptr);

 protected:
  //
  // The following methods implement the DfsHloVisitor interface.
  //
  // Default action which emits code for most operations. Operations which are
  // special in some way are handled explicitly in HandleFoo methods.
  Status DefaultAction(HloInstruction* hlo_instruction) override;

  Status HandleBitcast(HloInstruction* bitcast) override;
  Status HandleConstant(HloInstruction* constant,
                        const Literal& literal) override;
  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
  Status HandleGetTupleElement(HloInstruction* get_tuple_element,
                               HloInstruction* operand) override;
  Status HandleSelect(HloInstruction* select, HloInstruction* pred,
                      HloInstruction* on_true,
                      HloInstruction* on_false) override;
  Status HandleDot(HloInstruction* dot, HloInstruction* lhs,
                   HloInstruction* rhs) override;
  Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs,
                           HloInstruction* rhs, const Window& window) override;
  Status HandleCrossReplicaSum(HloInstruction* crs) override;
  Status HandleInfeed(HloInstruction* infeed) override;
  Status HandleSort(HloInstruction* sort, HloInstruction* operand) override;
  Status HandleParameter(HloInstruction* parameter) override;
  Status HandleReduce(HloInstruction* reduce, HloInstruction* arg,
                      HloInstruction* init_value,
                      tensorflow::gtl::ArraySlice<int64> dimensions,
                      HloComputation* function) override;
  Status HandleReduceWindow(HloInstruction* reduce_window,
                            HloInstruction* operand, const Window& window,
                            HloComputation* function) override;
  Status HandleSelectAndScatter(HloInstruction* instruction) override;
  Status HandleSend(HloInstruction* send) override;
  Status HandleRecv(HloInstruction* recv) override;
  Status HandlePad(HloInstruction* pad) override;
  Status HandleTuple(
      HloInstruction* tuple,
      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
  Status HandleMap(
      HloInstruction* map,
      tensorflow::gtl::ArraySlice<HloInstruction*> operands,
      HloComputation* function,
      tensorflow::gtl::ArraySlice<HloInstruction*> static_operands) override;
  Status HandleFusion(HloInstruction* fusion) override;
  Status HandleCall(HloInstruction* call,
                    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
                    HloComputation* computation) override;
  Status HandleCustomCall(HloInstruction* custom_call,
                          tensorflow::gtl::ArraySlice<HloInstruction*> operands,
                          tensorflow::StringPiece custom_call_target) override;
  Status HandleWhile(HloInstruction* xla_while, HloInstruction* init,
                     HloComputation* condition, HloComputation* body) override;
  Status FinishVisit(HloInstruction* root) override;

  Status Preprocess(HloInstruction* hlo) override;
  Status Postprocess(HloInstruction* visited) override;

 private:
  // Private helper to initialize an IR function for the computation.
  void InitializeIrFunction(const string& function_name,
                            bool is_entry_computation);

  // Convenience function to generate a GEP into the profile counter parameter
  // which would correspond to the index for a given HLO.
  llvm::Value* GetProfileCounterFor(const HloInstruction* hlo);

  // Convenience function to get the IR Value emitted previously for the given
  // hlo. Make sure to call it only when you're certain a value *was* emitted -
  // if not found, this will log a fatal error.
  llvm::Value* GetEmittedValueFor(const HloInstruction* hlo);

  // Convenience function to get an IrArray representing the given hlo.
  llvm_ir::IrArray GetIrArrayForOp(const HloInstruction* hlo);

  // Augments IrArray with aliasing information. Delegates to the
  // llvm_ir::AliasAnalysis instance held in alias_analysis_.
  void AddAliasingInformationToIrArray(const HloInstruction& hlo,
                                       llvm_ir::IrArray* array) {
    alias_analysis_.AddAliasingInformationToIrArray(hlo, array);
  }

  // Convenience function to get the IR type matching the given shape.
  llvm::Type* IrShapeType(const Shape& shape);

  // Get the llvm::Value* that represents the "retval" argument of the
  // computation function being emitted by this emitter.
  llvm::Argument* GetResultArgument();

  // Get the llvm::Value* that represents the "prof_counters" argument of the
  // computation function being emitted by this emitter.
  llvm::Argument* GetProfileCountersArgument();

  // Get the xla::ExecutableRunOptions that represents the "run_options"
  // argument of the computation function being emitted by this emitter.
  llvm::Value* GetExecutableRunOptionsArgument();

  // Get the llvm::Value* that represents the "temps" argument of the
  // computation function being emitted by this emitter.
  llvm::Value* GetTempBuffersArgument();

  // Emits code that computes the address of the given temporary buffer to the
  // function. target_shape is the shape of this temporary buffer.
  // The returned Value's type is a pointer to element_type.
  llvm::Value* EmitTempBufferPointer(BufferAllocation::Index temp_buf_index,
                                     const Shape& target_shape);

  // Emits a function into the current module. This can be used for
  // computations embedded inside other computations, such as the
  // function that a map operation applies.
  StatusOr<llvm::Function*> EmitFunction(
      HloComputation* function,  // The function to emit.
      tensorflow::StringPiece
          function_name_suffix);  // Used for LLVM IR register names.

  // Methods that emit a function call.
  // Parameters:
  //   function - The LLVM function to call.
  //   return_shape - The return shape of the HLO computation that was used to
  //     make the function. Not the same as the return type of the function
  //     in LLVM, since we use output parameters for the return type.
  //   element_count - number of elements to return (array form only).
  //   parameter_addresses - pointers to be passed to the function as
  //     parameters.
  //   name - used for LLVM IR register names.

  // Emits a function call, returning a scalar, often an element of a larger
  // array. Returns a Value for the scalar element returned by the function.
  llvm::Value* EmitElementFunctionCall(
      llvm::Function* function, const Shape& return_shape,
      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
      tensorflow::StringPiece name);

  // Array function call emitter. Stores the function's result into a supplied
  // buffer.
  // Parameters:
  //   function - The LLVM function to call.
  //   parameter_addresses - pointers to be passed to the function as
  //     parameters.
  //   return_value - pointer to a buffer where the call result is stored.
  void EmitArrayFunctionCallInto(
      llvm::Function* function,
      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
      llvm::Value* return_value, tensorflow::StringPiece name);

  // Array function call emitter. Returns a Value for the function's return
  // value buffer address. The return value buffer is alloca'ed by this
  // function.
  llvm::Value* EmitArrayFunctionCall(
      llvm::Function* function, const Shape& return_shape, int64 element_count,
      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
      tensorflow::StringPiece name);

  // Verifies that the element types of all of the given operand instructions
  // match and are of one of the given supported types.
  Status ElementTypesSameAndSupported(
      const HloInstruction& instruction,
      tensorflow::gtl::ArraySlice<const HloInstruction*> operands,
      tensorflow::gtl::ArraySlice<PrimitiveType> supported_types);

  // Emit IR to perform a computation for every element in the given target op.
  // This produces a series of nested loops (one for each dimension of the op's
  // shape). The body of the inner-most loop is provided by the body_emitter
  // function.
  //
  // TODO(jingyue): target_op should be a `const HloInstruction*`.
  Status EmitTargetElementLoop(
      HloInstruction* target_op,
      const llvm_ir::ElementGenerator& element_generator);

  // Emits a memcpy from the source instruction's result value to the
  // destination's. Both source and destination must have an entry in the
  // emitted_value_ table.
  Status EmitMemcpy(const HloInstruction& source,
                    const HloInstruction& destination);

  // Emit IR to compute the target address of the buffer for the given op.
  // The returned Value is a pointer to a IR type that represents the op's
  // element type.
  StatusOr<llvm::Value*> EmitTargetAddressForOp(const HloInstruction* op);

  // Structurizes "array_elements" into an MD array that represents "shape".
  // This is a recursive function, and "dimension_index" indicates the index of
  // the current dimension that the function is considering (0 means the
  // most-minor dimension).
  llvm::Constant* CreateInitializerForConstantArray(
      const std::vector<llvm::Constant*>& array_elements, const Shape& shape,
      int64 dimension_index);

  // Name of the computation entry function. This function serves as the
  // top-level "main" of the computation and will be invoked by the JIT.
  string entry_function_name_;

  // Assignment of the temporary buffers needed by the computation and their
  // shape information.
  const BufferAssignment& assignment_;

  // The LLVM module into which IR will be emitted.
  llvm::Module* module_;

  // The target architecture.
  llvm::Triple::ArchType arch_type_;

  // Used to produce unique names for generated functions.
  NameUniquer name_uniquer_;

  // Map containing all previously emitted computations.
  std::map<HloComputation*, llvm::Function*> emitted_functions_;

  // Map containing all previously emitted thread-local temporary buffers,
  // keyed by the emitted function and the buffer's allocation index.
  std::map<std::pair<llvm::Function*, BufferAllocation::Index>,
           llvm::AllocaInst*>
      thread_local_buffers_;

  // The following fields track the IR emission state. According to LLVM memory
  // management rules, their memory is owned by the module.
  llvm::Function* compute_function_;
  llvm::IRBuilder<> ir_builder_;

  // Maps HLOs to their index into the profile counter array.
  const std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;

  // Maps HLOs to Values emitted for them.
  std::unordered_map<const HloInstruction*, llvm::Value*> emitted_value_;

  // Used by AddAliasingInformationToIrArray to annotate emitted IrArrays with
  // aliasing metadata.
  llvm_ir::AliasAnalysis alias_analysis_;

  // This struct contains all the state needed to emit instructions for
  // profiling a computation.
  class ProfilingState {
   public:
    ProfilingState()
        : is_entry_computation_(false),
          use_rdtscp_(false),
          prof_counters_(nullptr) {}
    ProfilingState(bool is_entry_computation, bool use_rdtscp,
                   llvm::Argument* prof_counters)
        : is_entry_computation_(is_entry_computation),
          use_rdtscp_(use_rdtscp),
          prof_counters_(prof_counters) {}

    // Record the cycle counter before an HLO executes.
    void RecordCycleStart(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo);
    // Record the number of cycles it took for an HLO to execute.
    void RecordCycleDelta(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo,
                          llvm::Value* prof_counter);
    // Record the number of cycles it took for the entire computation to
    // execute.
    void RecordCompleteComputation(llvm::IRBuilder<>* ir_builder,
                                   llvm::Value* prof_counter);

    // Convenience function to generate a call to an intrinsic which reads the
    // CPU cycle counter.
    llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* ir_builder);

    // Store the cycle counter delta to the per-HLO profile counter.
    void UpdateProfileCounter(llvm::IRBuilder<>* ir_builder,
                              llvm::Value* prof_counter, llvm::Value* cycle_end,
                              llvm::Value* cycle_start);

   private:
    // Is this IrEmitter for a top-level computation?
    bool is_entry_computation_;

    // Should we use the x86-specific rdtscp or the generic readcyclecounter
    // intrinsic?
    bool use_rdtscp_;

    // The argument which corresponds to the profile counter buffer.
    llvm::Argument* prof_counters_;

    // The first read cycle counter in the program.
    llvm::Value* first_read_cycle_start_ = nullptr;

    // The last read cycle counter in the program.
    llvm::Value* last_read_cycle_end_ = nullptr;

    // An alloca used to hold the output of the aux value returned by the rdtscp
    // intrinsic.
    llvm::Value* aux_i8ptr_ = nullptr;

    // Maps HLOs to the value the cycle counter contained right before the HLO
    // began to execute.
    std::unordered_map<const HloInstruction*, llvm::Value*> cycle_starts_;
  };

  ProfilingState profiling_state_;

  // Given a load instruction and a shape or buffer size, annotate the load's
  // result with the alignment required by the shape or size.
  void AttachAlignmentMetadataForLoad(llvm::LoadInst* load, const Shape& shape);
  void AttachAlignmentMetadataForLoad(llvm::LoadInst* load, int64 buffer_size);

  // Given a load instruction and a shape or buffer size, annotate the load's
  // result with the dereferenceable bytes required by the shape / buffer size.
  void AttachDereferenceableMetadataForLoad(llvm::LoadInst* load,
                                            const Shape& shape);
  void AttachDereferenceableMetadataForLoad(llvm::LoadInst* load,
                                            int64 buffer_size);

  // Calculate the alignment of a buffer allocated for a given shape.
  int MinimumAlignmentForShape(const Shape& shape);

  // Calculate the alignment of a buffer allocated for a given primitive type.
  int MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type);

  // Calculate the alignment of a buffer with a particular size.
  int MinimumAlignmentForBufferSize(int64 buffer_size);

  // Returns the number of bytes within the shape.
  int64 ByteSizeOf(const Shape& shape) const;

  // Configuration of the HLO module being compiled (passed at construction).
  const HloModuleConfig& hlo_module_config_;

  TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
};

}  // namespace cpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMITTER_H_