blob: feb177a7da9e564ccf417e21050486858b06822f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
// This schema uses proto3 syntax.
syntax = "proto3";
// Every message in this file is declared in the tensorflow.tpu.op_profile
// package.
package tensorflow.tpu.op_profile;
// Top-level message of this schema: the data that summarizes a program.
message Profile {
  // Root node of the profile tree broken down by instruction category.
  Node by_category = 1;

  // Root node of the profile tree broken down by program structure.
  Node by_program_structure = 2;

  // Per-program profiles. Each key is the HLO module name of the program
  // whose profile tree is stored as the value.
  map<string, Node> per_program = 3;
}
// A single entry in the profile tree. An entry stands for one instruction or
// for a set of instructions.
message Node {
  // Describes a category of XLA instructions. Carries no fields of its own.
  // For such a node, Node.name is a descriptive string, like
  // "data formatting".
  message InstructionCategory {}

  // Describes a single XLA instruction.
  // For such a node, Node.name is the unique instruction id, like
  // "%multiply.5".
  message XLAInstruction {
    // Physical memory layout of the instruction's primary input.
    message LayoutAnalysis {
      // One dimension of the physical data layout.
      message Dimension {
        // Size of the data in this dimension.
        int32 size = 1;

        // Data must be padded to a multiple of alignment.
        int32 alignment = 2;

        // What the dimension represents, e.g. "spatial".
        string semantics = 3;
      }

      // The physical data layout, listed from the most-minor dimension to the
      // most-major dimension.
      repeated Dimension dimensions = 1;
    }

    // Opcode, like %multiply.
    string op = 1;

    // Expression text, e.g.
    // %multiply = [shape]multiply(operand1, operand2)
    string expression = 2;

    // Where the instruction came from; typically the TensorFlow operation
    // name.
    string provenance = 3;

    // Category of this instruction.
    // NOTE(review): presumably the same kind of descriptive string used as the
    // name of an InstructionCategory node -- confirm against the producer.
    string category = 4;

    // Layout of the instruction's primary input. For a convolution, for
    // example, this analyzes the image and ignores the kernel.
    LayoutAnalysis layout = 5;
  }

  // Name of this node. Its semantics depend on which `contents` member is
  // set.
  string name = 1;

  // Measurements for this node. May be omitted, e.g. for fused instructions.
  Metrics metrics = 2;

  // Child entries of this node in the profile tree.
  repeated Node children = 3;

  // Details about what this node represents.
  oneof contents {
    InstructionCategory category = 4;
    XLAInstruction xla = 5;
  }
}
// Measurements of a single operation, or of an aggregated set of operations.
// Every metric here is a "total" value rather than a "self" value.
message Metrics {
  // Core-time taken by this operation, expressed as a fraction of all
  // operations.
  double time = 1;

  // Floating point computations performed by this operation, expressed as a
  // fraction of (peak core FLOPS * program time). Useful properties of this
  // representation:
  //   - it is proportional to the number of floating point operations
  //     performed
  //   - utilization is flops/time
  //   - wasted potential flops is proportional to time - flops
  //   - it does not reveal the peak core FLOPS of the hardware
  double flops = 2;

  // The VMEM bandwidth used to load operands from HBM, expressed as a fraction
  // of the theoretical VMEM bandwidth on the specific hardware.
  double memory_bandwidth = 3;

  // NOTE(review): field numbers jump from 3 to 11 here. If 4-10 belonged to
  // fields that were removed, they should be listed in a `reserved` statement
  // so they cannot be reused -- TODO confirm against the schema history.

  // Elapsed core-time in picoseconds.
  double raw_time = 11;

  // Total floating-point operations performed.
  double raw_flops = 12;

  // Total bytes accessed (includes both reads and writes).
  double raw_bytes_accessed = 13;
}
|