Diffstat (limited to 'tensorflow/compiler/xla/service/cpu/ir_emitter.h')
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_emitter.h  402
1 file changed, 402 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
new file mode 100644
index 0000000000..06415c735d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -0,0 +1,402 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMITTER_H_
+
+#include <stddef.h>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "external/llvm/include/llvm/ADT/Triple.h"
+#include "external/llvm/include/llvm/IR/Function.h"
+#include "external/llvm/include/llvm/IR/IRBuilder.h"
+#include "external/llvm/include/llvm/IR/Module.h"
+#include "external/llvm/include/llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
+#include "tensorflow/compiler/xla/service/name_uniquer.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace cpu {
+
+// This class is the top-level API for the XLA HLO --> LLVM IR compiler. It
+// implements the DfsHloVisitor interface and emits HLO computations as LLVM IR
+// functions.
+class IrEmitter : public DfsHloVisitorWithDefault {
+ public:
+ // Create a new LLVM IR emitter.
+ //
+ // hlo_module: the HLO module we are emitting IR for.
+ // assignment: a BufferAssignment from which we know which temporary buffers
+ // are used by the HLO nodes.
+ // llvm_module: the LLVM module to emit IR into.
+ // hlo_to_profile_idx: the mapping from HLO to its index in the profiling
+ // array.
+ IrEmitter(const HloModule& hlo_module, const HloModuleConfig& module_config,
+ const BufferAssignment& assignment, llvm::Module* llvm_module,
+ const std::unordered_map<const HloInstruction*, size_t>*
+ hlo_to_profile_idx);
+ ~IrEmitter() override;
+
+ // Emit and return the given HLO computation as an LLVM IR
+ // function. function_name_prefix is the desired name of the function. If the
+ // name is not unique among already emitted functions, a suffix is appended
+ // to make it unique. is_entry_computation indicates whether this is the
+ // entry computation of the HLO module. If 'instruction_order' is given, the
+ // HLO instructions are emitted in that order; in this case,
+ // 'instruction_order' must be a topological sort of the set of nodes
+ // reachable from the root of the computation.
+ StatusOr<llvm::Function*> EmitComputation(
+ HloComputation* computation, const string& function_name_prefix,
+ bool is_entry_computation,
+ std::vector<const HloInstruction*>* instruction_order = nullptr);
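+
+ // A minimal usage sketch (hypothetical caller; names such as hlo_module,
+ // assignment and llvm_module are assumed to exist and are not defined in
+ // this file):
+ //
+ //   IrEmitter ir_emitter(*hlo_module, module_config, *assignment,
+ //                        llvm_module, /*hlo_to_profile_idx=*/nullptr);
+ //   TF_ASSIGN_OR_RETURN(
+ //       llvm::Function* entry_function,
+ //       ir_emitter.EmitComputation(hlo_module->entry_computation(),
+ //                                  "entry", /*is_entry_computation=*/true));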
+
+ protected:
+ //
+ // The following methods implement the DfsHloVisitor interface.
+ //
+ // Default action which emits code for most operations. Operations which are
+ // special in some way are handled explicitly in HandleFoo methods.
+ Status DefaultAction(HloInstruction* hlo_instruction) override;
+
+ Status HandleBitcast(HloInstruction* bitcast) override;
+ Status HandleConstant(HloInstruction* constant,
+ const Literal& literal) override;
+ Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
+ Status HandleGetTupleElement(HloInstruction* get_tuple_element,
+ HloInstruction* operand) override;
+ Status HandleSelect(HloInstruction* select, HloInstruction* pred,
+ HloInstruction* on_true,
+ HloInstruction* on_false) override;
+ Status HandleDot(HloInstruction* dot, HloInstruction* lhs,
+ HloInstruction* rhs) override;
+ Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs,
+ HloInstruction* rhs, const Window& window) override;
+ Status HandleCrossReplicaSum(HloInstruction* crs) override;
+ Status HandleInfeed(HloInstruction* infeed) override;
+ Status HandleSort(HloInstruction* sort, HloInstruction* operand) override;
+ Status HandleParameter(HloInstruction* parameter) override;
+ Status HandleReduce(HloInstruction* reduce, HloInstruction* arg,
+ HloInstruction* init_value,
+ tensorflow::gtl::ArraySlice<int64> dimensions,
+ HloComputation* function) override;
+ Status HandleReduceWindow(HloInstruction* reduce_window,
+ HloInstruction* operand, const Window& window,
+ HloComputation* function) override;
+ Status HandleSelectAndScatter(HloInstruction* instruction) override;
+ Status HandleSend(HloInstruction* send) override;
+ Status HandleRecv(HloInstruction* recv) override;
+ Status HandlePad(HloInstruction* pad) override;
+ Status HandleTuple(
+ HloInstruction* tuple,
+ tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+ Status HandleMap(
+ HloInstruction* map,
+ tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+ HloComputation* function,
+ tensorflow::gtl::ArraySlice<HloInstruction*> static_operands) override;
+ Status HandleFusion(HloInstruction* fusion) override;
+ Status HandleCall(HloInstruction* call,
+ tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+ HloComputation* computation) override;
+ Status HandleCustomCall(HloInstruction* custom_call,
+ tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+ tensorflow::StringPiece custom_call_target) override;
+ Status HandleWhile(HloInstruction* xla_while, HloInstruction* init,
+ HloComputation* condition, HloComputation* body) override;
+ Status FinishVisit(HloInstruction* root) override;
+
+ Status Preprocess(HloInstruction* hlo) override;
+ Status Postprocess(HloInstruction* visited) override;
+
+ private:
+ // Private helper to initialize an IR function for the computation.
+ void InitializeIrFunction(const string& function_name,
+ bool is_entry_computation);
+
+ // Convenience function to generate a GEP into the profile counter parameter
+ // which would correspond to the index for a given HLO.
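+ // Conceptually this computes &prof_counters[(*hlo_to_profile_idx_)[hlo]]
+ // (an illustrative description, not the verbatim emitted IR).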
+ llvm::Value* GetProfileCounterFor(const HloInstruction* hlo);
+
+ // Convenience function to get the IR Value emitted previously for the given
+ // hlo. Call it only when you're certain a value *was* emitted; if no value
+ // is found, this logs a fatal error.
+ llvm::Value* GetEmittedValueFor(const HloInstruction* hlo);
+
+ // Convenience function to get an IrArray representing the given hlo.
+ llvm_ir::IrArray GetIrArrayForOp(const HloInstruction* hlo);
+
+ // Augments IrArray with aliasing information.
+ void AddAliasingInformationToIrArray(const HloInstruction& hlo,
+ llvm_ir::IrArray* array) {
+ alias_analysis_.AddAliasingInformationToIrArray(hlo, array);
+ }
+
+ // Convenience function to get the IR type matching the given shape.
+ llvm::Type* IrShapeType(const Shape& shape);
+
+ // Get the llvm::Value* that represents the "retval" argument of the
+ // computation function being emitted by this emitter.
+ llvm::Argument* GetResultArgument();
+
+ // Get the llvm::Value* that represents the "prof_counters" argument of the
+ // computation function being emitted by this emitter.
+ llvm::Argument* GetProfileCountersArgument();
+
+ // Get the llvm::Value* that represents the "run_options" argument (an
+ // xla::ExecutableRunOptions*) of the computation function being emitted by
+ // this emitter.
+
+ // Get the llvm::Value* that represents the "temps" argument of the
+ // computation function being emitted by this emitter.
+ llvm::Value* GetTempBuffersArgument();
+
+ // Emits code into the current function that computes the address of the
+ // given temporary buffer. target_shape is the shape of this temporary
+ // buffer. The returned Value's type is a pointer to element_type.
+ llvm::Value* EmitTempBufferPointer(BufferAllocation::Index temp_buf_index,
+ const Shape& target_shape);
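+
+ // As an illustrative sketch (register names hypothetical, not the verbatim
+ // output), the emitted IR for a buffer of f32 elements might look like:
+ //   %raw = getelementptr inbounds i8*, i8** %temps, i64 <temp_buf_index>
+ //   %buf = load i8*, i8** %raw
+ //   %typed = bitcast i8* %buf to float*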
+
+ // Emits a function into the current module. This can be used for
+ // computations embedded inside other computations, such as the
+ // function that a map operation applies.
+ StatusOr<llvm::Function*> EmitFunction(
+ HloComputation* function, // The function to emit.
+ tensorflow::StringPiece
+ function_name_suffix); // Used for LLVM IR register names.
+
+ // Methods that emit a function call.
+ // Parameters:
+ // function - The LLVM function to call.
+ // return_shape - The return shape of the HLO computation that was used to
+ // make the function. Not the same as the return type of the function
+ // in LLVM, since we use an output parameter for the return value.
+ // element_count - number of elements to return (array form only).
+ // parameter_addresses - pointers to be passed to the function as
+ // parameters.
+ // name - used for LLVM IR register names.
+
+ // Emits a function call, returning a scalar, often an element of a larger
+ // array. Returns a Value for the scalar element returned by the function.
+ llvm::Value* EmitElementFunctionCall(
+ llvm::Function* function, const Shape& return_shape,
+ tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+ tensorflow::StringPiece name);
+
+ // Array function call emitter. Stores the function's result into a supplied
+ // buffer.
+ // Parameters:
+ // function - The LLVM function to call.
+ // parameter_addresses - pointers to be passed to the function as
+ // parameters.
+ // return_value - pointer to a buffer where the call result is stored.
+
+ void EmitArrayFunctionCallInto(
+ llvm::Function* function,
+ tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+ llvm::Value* return_value, tensorflow::StringPiece name);
+
+ // Array function call emitter. Returns a Value for the function's return
+ // value buffer address. The return value buffer is alloca'ed by this
+ // function.
+ llvm::Value* EmitArrayFunctionCall(
+ llvm::Function* function, const Shape& return_shape, int64 element_count,
+ tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+ tensorflow::StringPiece name);
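+
+ // Illustrative shape of an emitted call under the convention above (register
+ // names hypothetical; the argument list mirrors the "retval", "run_options",
+ // "temps" and "prof_counters" arguments described elsewhere in this class):
+ //   %args   = alloca i8*, i32 <num_parameters>
+ //   ...store each parameter address into %args...
+ //   %retval = alloca <element_type>, i64 <element_count>
+ //   call void @callee(i8* %retval, i8* %run_options, i8** %args,
+ //                     i8** %temps, i64* %prof_counters)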
+
+ // Verifies that the element types of all of the given operand instructions
+ // match and are of one of the given supported types.
+ Status ElementTypesSameAndSupported(
+ const HloInstruction& instruction,
+ tensorflow::gtl::ArraySlice<const HloInstruction*> operands,
+ tensorflow::gtl::ArraySlice<PrimitiveType> supported_types);
+
+ // Emit IR to perform a computation for every element in the given target op.
+ // This produces a series of nested loops (one for each dimension of the op's
+ // shape). The body of the innermost loop is provided by the element_generator
+ // function.
+ //
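+ // Conceptually, for a rank-2 target shape [M, N] this emits (pseudocode):
+ //   for (int64 i = 0; i < M; ++i)
+ //     for (int64 j = 0; j < N; ++j)
+ //       target[i][j] = element_generator({i, j});
+ //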
+ // TODO(jingyue): target_op should be a `const HloInstruction*`.
+ Status EmitTargetElementLoop(
+ HloInstruction* target_op,
+ const llvm_ir::ElementGenerator& element_generator);
+
+ // Emits a memcpy from the source instruction's result value to the
+ // destination's. Both source and destination must have an entry in the
+ // emitted_value_ table.
+ Status EmitMemcpy(const HloInstruction& source,
+ const HloInstruction& destination);
+
+ // Emit IR to compute the target address of the buffer for the given op.
+ // The returned Value is a pointer to an IR type that represents the op's
+ // element type.
+ StatusOr<llvm::Value*> EmitTargetAddressForOp(const HloInstruction* op);
+
+ // Structures "array_elements" into a multidimensional array constant that
+ // represents "shape".
+ // This is a recursive function, and "dimension_index" indicates the index of
+ // the current dimension that the function is considering (0 means the
+ // most-minor dimension).
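+ // For example (illustrative): for a 2x2 shape with array_elements
+ // {a, b, c, d}, the result is the nested constant {{a, b}, {c, d}}.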
+ llvm::Constant* CreateInitializerForConstantArray(
+ const std::vector<llvm::Constant*>& array_elements, const Shape& shape,
+ int64 dimension_index);
+
+ // Name of the computation entry function. This function serves as the
+ // top-level "main" of the computation and will be invoked by the JIT.
+ string entry_function_name_;
+
+ // Assignment of the temporary buffers needed by the computation and their
+ // shape information.
+ const BufferAssignment& assignment_;
+
+ // The LLVM module into which IR will be emitted.
+ llvm::Module* module_;
+
+ // The target architecture.
+ llvm::Triple::ArchType arch_type_;
+
+ // Used to produce unique names for generated functions.
+ NameUniquer name_uniquer_;
+
+ // Map containing all previously emitted computations.
+ std::map<HloComputation*, llvm::Function*> emitted_functions_;
+
+ // Map containing all previously emitted thread-local temporary buffers.
+ std::map<std::pair<llvm::Function*, BufferAllocation::Index>,
+ llvm::AllocaInst*>
+ thread_local_buffers_;
+
+ // The following fields track the IR emission state. According to LLVM memory
+ // management rules, their memory is owned by the module.
+ llvm::Function* compute_function_;
+ llvm::IRBuilder<> ir_builder_;
+
+ // Maps HLOs to their index into the profile counter array.
+ const std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;
+
+ // Maps HLOs to Values emitted for them.
+ std::unordered_map<const HloInstruction*, llvm::Value*> emitted_value_;
+
+ llvm_ir::AliasAnalysis alias_analysis_;
+
+ // This struct contains all the state needed to emit instructions for
+ // profiling a computation.
+ class ProfilingState {
+ public:
+ ProfilingState()
+ : is_entry_computation_(false),
+ use_rdtscp_(false),
+ prof_counters_(nullptr) {}
+ ProfilingState(bool is_entry_computation, bool use_rdtscp,
+ llvm::Argument* prof_counters)
+ : is_entry_computation_(is_entry_computation),
+ use_rdtscp_(use_rdtscp),
+ prof_counters_(prof_counters) {}
+
+ // Record the cycle counter before an HLO executes.
+ void RecordCycleStart(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo);
+ // Record the number of cycles it took for an HLO to execute.
+ void RecordCycleDelta(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo,
+ llvm::Value* prof_counter);
+ // Record the number of cycles it took for the entire computation to
+ // execute.
+ void RecordCompleteComputation(llvm::IRBuilder<>* ir_builder,
+ llvm::Value* prof_counter);
+
+ // Convenience function to generate a call to an intrinsic which reads the
+ // CPU cycle counter.
+ llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* ir_builder);
+
+ // Store the cycle counter delta to the per-HLO profile counter.
+ void UpdateProfileCounter(llvm::IRBuilder<>* ir_builder,
+ llvm::Value* prof_counter, llvm::Value* cycle_end,
+ llvm::Value* cycle_start);
+
+ private:
+ // Is this IrEmitter for a top-level computation?
+ bool is_entry_computation_;
+
+ // Should we use the x86-specific rdtscp or the generic readcyclecounter
+ // intrinsic?
+ bool use_rdtscp_;
+
+ // The argument which corresponds to the profile counter buffer.
+ llvm::Argument* prof_counters_;
+
+ // The first read cycle counter in the program.
+ llvm::Value* first_read_cycle_start_ = nullptr;
+
+ // The last read cycle counter in the program.
+ llvm::Value* last_read_cycle_end_ = nullptr;
+
+ // An alloca used to hold the aux value returned by the rdtscp
+ // intrinsic.
+ llvm::Value* aux_i8ptr_ = nullptr;
+
+ // Maps HLOs to the value the cycle counter contained right before the HLO
+ // began to execute.
+ std::unordered_map<const HloInstruction*, llvm::Value*> cycle_starts_;
+ };
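+
+ // Conceptually, the instrumentation ProfilingState emits around each
+ // profiled HLO looks like this (illustrative sketch; the rdtscp path
+ // differs slightly):
+ //   %start = call i64 @llvm.readcyclecounter()
+ //   ...code emitted for the HLO...
+ //   %end   = call i64 @llvm.readcyclecounter()
+ //   %delta = sub i64 %end, %start
+ //   ; accumulate %delta into prof_counters[<hlo index>]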
+
+ ProfilingState profiling_state_;
+
+ // Given a load instruction and a shape or buffer size, annotate the load's
+ // result with the alignment required by the shape or size.
+ void AttachAlignmentMetadataForLoad(llvm::LoadInst* load, const Shape& shape);
+ void AttachAlignmentMetadataForLoad(llvm::LoadInst* load, int64 buffer_size);
+
+ // Given a load instruction and a shape or buffer size, annotate the load's
+ // result with the dereferenceable bytes required by the shape / buffer size.
+ void AttachDereferenceableMetadataForLoad(llvm::LoadInst* load,
+ const Shape& shape);
+ void AttachDereferenceableMetadataForLoad(llvm::LoadInst* load,
+ int64 buffer_size);
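+
+ // As an illustrative sketch, an annotated load might look like:
+ //   %buf = load i8*, i8** %slot, !align !0, !dereferenceable !1
+ //   !0 = !{i64 <alignment in bytes>}
+ //   !1 = !{i64 <buffer size in bytes>}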
+
+ // Calculate the alignment of a buffer allocated for a given shape.
+ int MinimumAlignmentForShape(const Shape& shape);
+
+ // Calculate the alignment of a buffer allocated for a given primitive type.
+ int MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type);
+
+ // Calculate the alignment of a buffer with a particular size.
+ int MinimumAlignmentForBufferSize(int64 buffer_size);
+
+ // Returns the number of bytes within the shape.
+ int64 ByteSizeOf(const Shape& shape) const;
+
+ const HloModuleConfig& hlo_module_config_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
+};
+
+} // namespace cpu
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMITTER_H_