aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/tensorboard/db/schema.cc
diff options
context:
space:
mode:
authorGravatar Justine Tunney <jart@google.com>2018-01-11 16:08:50 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-01-11 16:12:46 -0800
commitfebdd26ae594133d24f82544706b1e012a5cf1ea (patch)
treedd325008019ab10ce35f98368bf392ce4a118ec9 /tensorflow/contrib/tensorboard/db/schema.cc
parentfc252eb976c98c95a625ea6e6a0486334d3c5b6e (diff)
Add reservoir sampling to DB summary writer
This thing is kind of cool. It's able to turn a 350MB event log into a 35MB SQLite file at 80MB/s with one MacBook core. Best of all, this was accomplished using a normalized schema without the embedded protos. PiperOrigin-RevId: 181676380
Diffstat (limited to 'tensorflow/contrib/tensorboard/db/schema.cc')
-rw-r--r--tensorflow/contrib/tensorboard/db/schema.cc239
1 files changed, 129 insertions, 110 deletions
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/contrib/tensorboard/db/schema.cc
index 2cd00876f8..6ccd386dc0 100644
--- a/tensorflow/contrib/tensorboard/db/schema.cc
+++ b/tensorflow/contrib/tensorboard/db/schema.cc
@@ -22,8 +22,7 @@ namespace {
Status Run(Sqlite* db, const char* sql) {
SqliteStatement stmt;
TF_RETURN_IF_ERROR(db->Prepare(sql, &stmt));
- TF_RETURN_IF_ERROR(stmt.StepAndReset());
- return Status::OK();
+ return stmt.StepAndReset();
}
} // namespace
@@ -38,37 +37,34 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
db->PrepareOrDie("PRAGMA user_version=0").StepAndResetOrDie();
Status s;
- // Creates Ids table.
+ // Ids identify resources.
//
- // This table must be used to randomly allocate Permanent IDs for
- // all top-level tables, in order to maintain an invariant where
- // foo_id != bar_id for all IDs of any two tables.
+ // This table can be used to efficiently generate Permanent IDs in
+ // conjunction with a random number generator. Unlike rowids, these
+ // IDs are safe to use in URLs and unique across tables.
//
- // A row should only be deleted from this table if it can be
- // guaranteed that it exists absolutely nowhere else in the entire
- // system.
+ // Within any given system, no foo_id may equal any bar_id, for any
+ // rows of any two (Foos, Bars) tables. A row should only be
+ // deleted from this table if there's a very high level of confidence
+ // it exists nowhere else in the system.
//
// Fields:
- // id: An ID that was allocated globally. This must be in the
- // range [1,2**47). 0 is assigned the same meaning as NULL and
- // shouldn't be stored; 2**63-1 is reserved for statically
- // allocating space in a page to UPDATE later; and all other
- // int64 values are reserved for future use.
+ // id: The system-wide ID. This must be in the range [1,2**47). 0
+ // is assigned the same meaning as NULL and shouldn't be stored;
+ // all other int64 values are reserved for future use. Please
+ // note that id is also the rowid.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Ids (
id INTEGER PRIMARY KEY
)
)sql"));
- // Creates Descriptions table.
- //
- // This table allows TensorBoard to associate Markdown text with any
- // object in the database that has a Permanent ID.
+ // Descriptions are Markdown text that can be associated with any
+ // resource that has a Permanent ID.
//
// Fields:
- // id: The Permanent ID of the associated object. This is also the
- // SQLite rowid.
- // description: Arbitrary Markdown text.
+ // id: The foo_id of the associated row in Foos.
+ // description: Arbitrary NUL-terminated Markdown text.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Descriptions (
id INTEGER PRIMARY KEY,
@@ -76,121 +72,136 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
)
)sql"));
- // Creates Tensors table.
+ // Tensors are 0..n-dimensional numbers or strings.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
- // tag_id: ID of associated Tag.
+ // rowid: Ephemeral b-tree ID.
+ // series: The Permanent ID of a different resource, e.g. tag_id. A
+ // tensor will be vacuumed if its series matches no foo_id in any
+ // row of any Foos. When series is NULL this tensor may serve
+ // undefined purposes. This field should be set on placeholders.
+ // step: Arbitrary number to uniquely order tensors within series.
+ // The meaning of step is undefined when series is NULL. This may
+ // be set on placeholders to prepopulate index pages.
// computed_time: Float UNIX timestamp with microsecond precision.
// In the old summaries system that uses FileWriter, this is the
// wall time around when tf.Session.run finished. In the new
// summaries system, it is the wall time of when the tensor was
// computed. On systems with monotonic clocks, it is calculated
// by adding the monotonic run duration to Run.started_time.
- // This field is not indexed because, in practice, it should be
- // ordered the same or nearly the same as TensorIndex, so local
- // insertion sort might be more suitable.
- // step: User-supplied number, ordering this tensor in Tag.
- // If NULL then the Tag must have only one Tensor.
- // tensor: Can be an INTEGER (DT_INT64), FLOAT (DT_DOUBLE), or
- // BLOB. The structure of a BLOB is currently undefined, but in
- // essence it is a Snappy tf.TensorProto that spills over into
- // TensorChunks.
+ // dtype: The tensorflow::DataType ID. For example, DT_INT64 is 9.
+ // When NULL or 0 this must be treated as a placeholder row that
+ // does not officially exist.
+ // shape: A comma-delimited list of int64 >=0 values representing
+ // the length of each dimension in the tensor. This must be a valid
+ // shape. That means no -1 values and, in the case of numeric
+ // tensors, length(data) == product(shape) * sizeof(dtype). Empty
+ // means this is a scalar a.k.a. 0-dimensional tensor.
+ // data: Little-endian raw tensor memory. If dtype is DT_STRING and
+ // shape is empty, the nullness of this field indicates whether or
+ // not it contains the tensor contents; otherwise TensorStrings
+ // must be queried. If dtype is NULL then ZEROBLOB can be used on
+ // this field to reserve row space to be updated later.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Tensors (
rowid INTEGER PRIMARY KEY,
- tag_id INTEGER NOT NULL,
- computed_time REAL,
+ series INTEGER,
step INTEGER,
- tensor BLOB
+ dtype INTEGER,
+ computed_time REAL,
+ shape TEXT,
+ data BLOB
)
)sql"));
- // Uniquely indexes (tag_id, step) on Tensors table.
s.Update(Run(db, R"sql(
- CREATE UNIQUE INDEX IF NOT EXISTS TensorIndex
- ON Tensors (tag_id, step)
+ CREATE UNIQUE INDEX IF NOT EXISTS
+ TensorSeriesStepIndex
+ ON
+ Tensors (series, step)
+ WHERE
+ series IS NOT NULL
+ AND step IS NOT NULL
)sql"));
- // Creates TensorChunks table.
+ // TensorStrings are the flat contents of 1..n dimensional DT_STRING
+ // Tensors.
//
- // This table can be used to split up a tensor across many rows,
- // which has the advantage of not slowing down table scans on the
- // main table, allowing asynchronous fetching, minimizing copying,
- // and preventing large buffers from being allocated.
+ // The number of rows associated with a Tensor must be equal to the
+ // product of its Tensors.shape.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
- // tag_id: ID of associated Tag.
- // step: Same as corresponding Tensors.step.
- // sequence: 1-indexed sequence number for ordering chunks. Please
- // note that the 0th index is Tensors.tensor.
- // chunk: Bytes of next chunk in tensor.
+ // rowid: Ephemeral b-tree ID.
+ // tensor_rowid: References Tensors.rowid.
+ // idx: Index in flattened tensor, starting at 0.
+ // data: The string value at a particular index. NUL characters are
+ // permitted.
s.Update(Run(db, R"sql(
- CREATE TABLE IF NOT EXISTS TensorChunks (
+ CREATE TABLE IF NOT EXISTS TensorStrings (
rowid INTEGER PRIMARY KEY,
- tag_id INTEGER NOT NULL,
- step INTEGER,
- sequence INTEGER,
- chunk BLOB
+ tensor_rowid INTEGER NOT NULL,
+ idx INTEGER NOT NULL,
+ data BLOB
)
)sql"));
- // Uniquely indexes (tag_id, step, sequence) on TensorChunks table.
s.Update(Run(db, R"sql(
- CREATE UNIQUE INDEX IF NOT EXISTS TensorChunkIndex
- ON TensorChunks (tag_id, step, sequence)
+ CREATE UNIQUE INDEX IF NOT EXISTS TensorStringIndex
+ ON TensorStrings (tensor_rowid, idx)
)sql"));
- // Creates Tags table.
+ // Tags are series of Tensors.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
+ // rowid: Ephemeral b-tree ID.
// tag_id: The Permanent ID of the Tag.
// run_id: Optional ID of associated Run.
- // tag_name: The tag field in summary.proto, unique across Run.
// inserted_time: Float UNIX timestamp with µs precision. This is
// always the wall time of when the row was inserted into the
// DB. It may be used as a hint for an archival job.
+ // tag_name: The tag field in summary.proto, unique across Run.
// display_name: Optional for GUI and defaults to tag_name.
// plugin_name: Arbitrary TensorBoard plugin name for dispatch.
// plugin_data: Arbitrary data that plugin wants.
+ //
+ // TODO(jart): Maybe there should be a Plugins table?
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Tags (
rowid INTEGER PRIMARY KEY,
run_id INTEGER,
tag_id INTEGER NOT NULL,
- tag_name TEXT,
inserted_time DOUBLE,
+ tag_name TEXT,
display_name TEXT,
plugin_name TEXT,
plugin_data BLOB
)
)sql"));
- // Uniquely indexes tag_id on Tags table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS TagIdIndex
ON Tags (tag_id)
)sql"));
- // Uniquely indexes (run_id, tag_name) on Tags table.
s.Update(Run(db, R"sql(
- CREATE UNIQUE INDEX IF NOT EXISTS TagNameIndex
- ON Tags (run_id, tag_name)
- WHERE tag_name IS NOT NULL
+ CREATE UNIQUE INDEX IF NOT EXISTS
+ TagRunNameIndex
+ ON
+ Tags (run_id, tag_name)
+ WHERE
+ run_id IS NOT NULL
+ AND tag_name IS NOT NULL
)sql"));
- // Creates Runs table.
+ // Runs are groups of Tags.
//
- // This table stores information about Runs. Each row usually
- // represents a single attempt at training or testing a TensorFlow
- // model, with a given set of hyper-parameters, whose summaries are
- // written out to a single event logs directory with a monotonic step
- // counter.
+ // Each Run usually represents a single attempt at training or testing
+ // a TensorFlow model, with a given set of hyper-parameters, whose
+ // summaries are written out to a single event logs directory with a
+ // monotonic step counter.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
+ // rowid: Ephemeral b-tree ID.
// run_id: The Permanent ID of the Run. This has a 1:1 mapping
// with a SummaryWriter instance. If two writers spawn for a
// given (user_name, experiment_name, run_name) then each should
@@ -199,8 +210,8 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
// previous invocations will then enter limbo, where they may be
// accessible for certain operations, but should be garbage
// collected eventually.
- // experiment_id: Optional ID of associated Experiment.
// run_name: User-supplied string, unique across Experiment.
+ // experiment_id: Optional ID of associated Experiment.
// inserted_time: Float UNIX timestamp with µs precision. This is
// always the time the row was inserted into the database. It
// does not change.
@@ -215,40 +226,33 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
// SummaryWriter resource that created this run was destroyed.
// Once this value becomes non-NULL a Run and its Tags and
// Tensors should be regarded as immutable.
- // graph_id: ID of associated Graphs row.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Runs (
rowid INTEGER PRIMARY KEY,
experiment_id INTEGER,
run_id INTEGER NOT NULL,
- run_name TEXT,
inserted_time REAL,
started_time REAL,
finished_time REAL,
- graph_id INTEGER
+ run_name TEXT
)
)sql"));
- // Uniquely indexes run_id on Runs table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS RunIdIndex
ON Runs (run_id)
)sql"));
- // Uniquely indexes (experiment_id, run_name) on Runs table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS RunNameIndex
ON Runs (experiment_id, run_name)
WHERE run_name IS NOT NULL
)sql"));
- // Creates Experiments table.
- //
- // This table stores information about experiments, which are sets of
- // runs.
+ // Experiments are groups of Runs.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
+ // rowid: Ephemeral b-tree ID.
// user_id: Optional ID of associated User.
// experiment_id: The Permanent ID of the Experiment.
// experiment_name: User-supplied string, unique across User.
@@ -259,34 +263,39 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
// the MIN(experiment.started_time, run.started_time) of each
// Run added to the database, including Runs which have since
// been overwritten.
+ // is_watching: A boolean indicating if someone is actively
+ // looking at this Experiment in the TensorBoard GUI. Tensor
+ // writers that do reservoir sampling can query this value to
+ // decide if they want the "keep last" behavior. This improves
+ // the performance of long running training while allowing low
+ // latency feedback in TensorBoard.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Experiments (
rowid INTEGER PRIMARY KEY,
user_id INTEGER,
experiment_id INTEGER NOT NULL,
- experiment_name TEXT,
inserted_time REAL,
- started_time REAL
+ started_time REAL,
+ is_watching INTEGER,
+ experiment_name TEXT
)
)sql"));
- // Uniquely indexes experiment_id on Experiments table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS ExperimentIdIndex
ON Experiments (experiment_id)
)sql"));
- // Uniquely indexes (user_id, experiment_name) on Experiments table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS ExperimentNameIndex
ON Experiments (user_id, experiment_name)
WHERE experiment_name IS NOT NULL
)sql"));
- // Creates Users table.
+ // Users are people who love TensorBoard.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
+ // rowid: Ephemeral b-tree ID.
// user_id: The Permanent ID of the User.
// user_name: Unique user name.
// email: Optional unique email address.
@@ -297,61 +306,66 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
CREATE TABLE IF NOT EXISTS Users (
rowid INTEGER PRIMARY KEY,
user_id INTEGER NOT NULL,
+ inserted_time REAL,
user_name TEXT,
- email TEXT,
- inserted_time REAL
+ email TEXT
)
)sql"));
- // Uniquely indexes user_id on Users table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS UserIdIndex
ON Users (user_id)
)sql"));
- // Uniquely indexes user_name on Users table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS UserNameIndex
ON Users (user_name)
WHERE user_name IS NOT NULL
)sql"));
- // Uniquely indexes email on Users table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS UserEmailIndex
ON Users (email)
WHERE email IS NOT NULL
)sql"));
- // Creates Graphs table.
+ // Graphs define how Tensors flowed in Runs.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
+ // rowid: Ephemeral b-tree ID.
+ // run_id: The Permanent ID of the associated Run. Only one Graph
+ // can be associated with a Run.
// graph_id: The Permanent ID of the Graph.
// inserted_time: Float UNIX timestamp with µs precision. This is
// always the wall time of when the row was inserted into the
// DB. It may be used as a hint for an archival job.
- // node_def: Contains Snappy tf.GraphDef proto. All fields will be
- // cleared except those not expressed in SQL.
+ // graph_def: Contains tf.GraphDef proto. All fields will be cleared
+ // except those not expressed in SQL.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Graphs (
rowid INTEGER PRIMARY KEY,
+ run_id INTEGER,
graph_id INTEGER NOT NULL,
inserted_time REAL,
graph_def BLOB
)
)sql"));
- // Uniquely indexes graph_id on Graphs table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS GraphIdIndex
ON Graphs (graph_id)
)sql"));
- // Creates Nodes table.
+ s.Update(Run(db, R"sql(
+ CREATE UNIQUE INDEX IF NOT EXISTS GraphRunIndex
+ ON Graphs (run_id)
+ WHERE run_id IS NOT NULL
+ )sql"));
+
+ // Nodes are the vertices in Graphs.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
+ // rowid: Ephemeral b-tree ID.
// graph_id: The Permanent ID of the associated Graph.
// node_id: ID for this node. This is more like a 0-index within
// the Graph. Please note indexes are allowed to be removed.
@@ -361,8 +375,10 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
// node_def.name proto field must not be cleared.
// op: Copied from tf.NodeDef proto.
// device: Copied from tf.NodeDef proto.
- // node_def: Contains Snappy tf.NodeDef proto. All fields will be
- // cleared except those not expressed in SQL.
+ // node_def: Contains tf.NodeDef proto. All fields will be cleared
+ // except those not expressed in SQL.
+ //
+ // TODO(jart): Make separate tables for op and device strings.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS Nodes (
rowid INTEGER PRIMARY KEY,
@@ -375,32 +391,35 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
)
)sql"));
- // Uniquely indexes (graph_id, node_id) on Nodes table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS NodeIdIndex
ON Nodes (graph_id, node_id)
)sql"));
- // Uniquely indexes (graph_id, node_name) on Nodes table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS NodeNameIndex
ON Nodes (graph_id, node_name)
WHERE node_name IS NOT NULL
)sql"));
- // Creates NodeInputs table.
+ // NodeInputs are directed edges between Nodes in Graphs.
//
// Fields:
- // rowid: Ephemeral b-tree ID dictating locality.
+ // rowid: Ephemeral b-tree ID.
// graph_id: The Permanent ID of the associated Graph.
// node_id: Index of Node in question. This can be considered the
// 'to' vertex.
// idx: Used for ordering inputs on a given Node.
// input_node_id: Nodes.node_id of the corresponding input node.
// This can be considered the 'from' vertex.
+ // input_node_idx: Since a Node can output multiple Tensors, this
+ // is the integer index of which of those outputs is our input.
+ // NULL is treated as 0.
// is_control: If non-zero, indicates this input is a control
// dependency, which means this isn't an edge through which
// tensors flow. NULL means 0.
+ //
+ // TODO(jart): Rename to NodeEdges.
s.Update(Run(db, R"sql(
CREATE TABLE IF NOT EXISTS NodeInputs (
rowid INTEGER PRIMARY KEY,
@@ -408,11 +427,11 @@ Status SetupTensorboardSqliteDb(Sqlite* db) {
node_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
input_node_id INTEGER NOT NULL,
+ input_node_idx INTEGER,
is_control INTEGER
)
)sql"));
- // Uniquely indexes (graph_id, node_id, idx) on NodeInputs table.
s.Update(Run(db, R"sql(
CREATE UNIQUE INDEX IF NOT EXISTS NodeInputsIndex
ON NodeInputs (graph_id, node_id, idx)