diff options
author | 2018-01-11 16:08:50 -0800 | |
---|---|---|
committer | 2018-01-11 16:12:46 -0800 | |
commit | febdd26ae594133d24f82544706b1e012a5cf1ea (patch) | |
tree | dd325008019ab10ce35f98368bf392ce4a118ec9 /tensorflow/contrib/tensorboard/db/schema.cc | |
parent | fc252eb976c98c95a625ea6e6a0486334d3c5b6e (diff) |
Add reservoir sampling to DB summary writer
This thing is kind of cool. It's able to turn a 350 MB event log into a
35 MB SQLite file at 80 MB/s with one MacBook core. Best of all, this was
accomplished using a normalized schema without the embedded protos.
PiperOrigin-RevId: 181676380
Diffstat (limited to 'tensorflow/contrib/tensorboard/db/schema.cc')
-rw-r--r-- | tensorflow/contrib/tensorboard/db/schema.cc | 239 |
1 files changed, 129 insertions, 110 deletions
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/contrib/tensorboard/db/schema.cc index 2cd00876f8..6ccd386dc0 100644 --- a/tensorflow/contrib/tensorboard/db/schema.cc +++ b/tensorflow/contrib/tensorboard/db/schema.cc @@ -22,8 +22,7 @@ namespace { Status Run(Sqlite* db, const char* sql) { SqliteStatement stmt; TF_RETURN_IF_ERROR(db->Prepare(sql, &stmt)); - TF_RETURN_IF_ERROR(stmt.StepAndReset()); - return Status::OK(); + return stmt.StepAndReset(); } } // namespace @@ -38,37 +37,34 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { db->PrepareOrDie("PRAGMA user_version=0").StepAndResetOrDie(); Status s; - // Creates Ids table. + // Ids identify resources. // - // This table must be used to randomly allocate Permanent IDs for - // all top-level tables, in order to maintain an invariant where - // foo_id != bar_id for all IDs of any two tables. + // This table can be used to efficiently generate Permanent IDs in + // conjunction with a random number generator. Unlike rowids these + // IDs safe to use in URLs and unique across tables. // - // A row should only be deleted from this table if it can be - // guaranteed that it exists absolutely nowhere else in the entire - // system. + // Within any given system, there can't be any foo_id == bar_id for + // all rows of any two (Foos, Bars) tables. A row should only be + // deleted from this table if there's a very high level of confidence + // it exists nowhere else in the system. // // Fields: - // id: An ID that was allocated globally. This must be in the - // range [1,2**47). 0 is assigned the same meaning as NULL and - // shouldn't be stored; 2**63-1 is reserved for statically - // allocating space in a page to UPDATE later; and all other - // int64 values are reserved for future use. + // id: The system-wide ID. This must be in the range [1,2**47). 0 + // is assigned the same meaning as NULL and shouldn't be stored + // and all other int64 values are reserved for future use. 
Please + // note that id is also the rowid. s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Ids ( id INTEGER PRIMARY KEY ) )sql")); - // Creates Descriptions table. - // - // This table allows TensorBoard to associate Markdown text with any - // object in the database that has a Permanent ID. + // Descriptions are Markdown text that can be associated with any + // resource that has a Permanent ID. // // Fields: - // id: The Permanent ID of the associated object. This is also the - // SQLite rowid. - // description: Arbitrary Markdown text. + // id: The foo_id of the associated row in Foos. + // description: Arbitrary NUL-terminated Markdown text. s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Descriptions ( id INTEGER PRIMARY KEY, @@ -76,121 +72,136 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { ) )sql")); - // Creates Tensors table. + // Tensors are 0..n-dimensional numbers or strings. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. - // tag_id: ID of associated Tag. + // rowid: Ephemeral b-tree ID. + // series: The Permanent ID of a different resource, e.g. tag_id. A + // tensor will be vacuumed if no series == foo_id exists for all + // rows of all Foos. When series is NULL this tensor may serve + // undefined purposes. This field should be set on placeholders. + // step: Arbitrary number to uniquely order tensors within series. + // The meaning of step is undefined when series is NULL. This may + // be set on placeholders to prepopulate index pages. // computed_time: Float UNIX timestamp with microsecond precision. // In the old summaries system that uses FileWriter, this is the // wall time around when tf.Session.run finished. In the new // summaries system, it is the wall time of when the tensor was // computed. On systems with monotonic clocks, it is calculated // by adding the monotonic run duration to Run.started_time. 
- // This field is not indexed because, in practice, it should be - // ordered the same or nearly the same as TensorIndex, so local - // insertion sort might be more suitable. - // step: User-supplied number, ordering this tensor in Tag. - // If NULL then the Tag must have only one Tensor. - // tensor: Can be an INTEGER (DT_INT64), FLOAT (DT_DOUBLE), or - // BLOB. The structure of a BLOB is currently undefined, but in - // essence it is a Snappy tf.TensorProto that spills over into - // TensorChunks. + // dtype: The tensorflow::DataType ID. For example, DT_INT64 is 9. + // When NULL or 0 this must be treated as a placeholder row that + // does not officially exist. + // shape: A comma-delimited list of int64 >=0 values representing + // length of each dimension in the tensor. This must be a valid + // shape. That means no -1 values and, in the case of numeric + // tensors, length(data) == product(shape) * sizeof(dtype). Empty + // means this is a scalar a.k.a. 0-dimensional tensor. + // data: Little-endian raw tensor memory. If dtype is DT_STRING and + // shape is empty, the nullness of this field indicates whether or + // not it contains the tensor contents; otherwise TensorStrings + // must be queried. If dtype is NULL then ZEROBLOB can be used on + // this field to reserve row space to be updated later. s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Tensors ( rowid INTEGER PRIMARY KEY, - tag_id INTEGER NOT NULL, - computed_time REAL, + series INTEGER, step INTEGER, - tensor BLOB + dtype INTEGER, + computed_time REAL, + shape TEXT, + data BLOB ) )sql")); - // Uniquely indexes (tag_id, step) on Tensors table. s.Update(Run(db, R"sql( - CREATE UNIQUE INDEX IF NOT EXISTS TensorIndex - ON Tensors (tag_id, step) + CREATE UNIQUE INDEX IF NOT EXISTS + TensorSeriesStepIndex + ON + Tensors (series, step) + WHERE + series IS NOT NULL + AND step IS NOT NULL )sql")); - // Creates TensorChunks table. 
+ // TensorStrings are the flat contents of 1..n dimensional DT_STRING + // Tensors. // - // This table can be used to split up a tensor across many rows, - // which has the advantage of not slowing down table scans on the - // main table, allowing asynchronous fetching, minimizing copying, - // and preventing large buffers from being allocated. + // The number of rows associated with a Tensor must be equal to the + // product of its Tensors.shape. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. - // tag_id: ID of associated Tag. - // step: Same as corresponding Tensors.step. - // sequence: 1-indexed sequence number for ordering chunks. Please - // note that the 0th index is Tensors.tensor. - // chunk: Bytes of next chunk in tensor. + // rowid: Ephemeral b-tree ID. + // tensor_rowid: References Tensors.rowid. + // idx: Index in flattened tensor, starting at 0. + // data: The string value at a particular index. NUL characters are + // permitted. s.Update(Run(db, R"sql( - CREATE TABLE IF NOT EXISTS TensorChunks ( + CREATE TABLE IF NOT EXISTS TensorStrings ( rowid INTEGER PRIMARY KEY, - tag_id INTEGER NOT NULL, - step INTEGER, - sequence INTEGER, - chunk BLOB + tensor_rowid INTEGER NOT NULL, + idx INTEGER NOT NULL, + data BLOB ) )sql")); - // Uniquely indexes (tag_id, step, sequence) on TensorChunks table. s.Update(Run(db, R"sql( - CREATE UNIQUE INDEX IF NOT EXISTS TensorChunkIndex - ON TensorChunks (tag_id, step, sequence) + CREATE UNIQUE INDEX IF NOT EXISTS TensorStringIndex + ON TensorStrings (tensor_rowid, idx) )sql")); - // Creates Tags table. + // Tags are series of Tensors. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. + // rowid: Ephemeral b-tree ID. // tag_id: The Permanent ID of the Tag. // run_id: Optional ID of associated Run. - // tag_name: The tag field in summary.proto, unique across Run. // inserted_time: Float UNIX timestamp with µs precision. 
This is // always the wall time of when the row was inserted into the // DB. It may be used as a hint for an archival job. + // tag_name: The tag field in summary.proto, unique across Run. // display_name: Optional for GUI and defaults to tag_name. // plugin_name: Arbitrary TensorBoard plugin name for dispatch. // plugin_data: Arbitrary data that plugin wants. + // + // TODO(jart): Maybe there should be a Plugins table? s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Tags ( rowid INTEGER PRIMARY KEY, run_id INTEGER, tag_id INTEGER NOT NULL, - tag_name TEXT, inserted_time DOUBLE, + tag_name TEXT, display_name TEXT, plugin_name TEXT, plugin_data BLOB ) )sql")); - // Uniquely indexes tag_id on Tags table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS TagIdIndex ON Tags (tag_id) )sql")); - // Uniquely indexes (run_id, tag_name) on Tags table. s.Update(Run(db, R"sql( - CREATE UNIQUE INDEX IF NOT EXISTS TagNameIndex - ON Tags (run_id, tag_name) - WHERE tag_name IS NOT NULL + CREATE UNIQUE INDEX IF NOT EXISTS + TagRunNameIndex + ON + Tags (run_id, tag_name) + WHERE + run_id IS NOT NULL + AND tag_name IS NOT NULL )sql")); - // Creates Runs table. + // Runs are groups of Tags. // - // This table stores information about Runs. Each row usually - // represents a single attempt at training or testing a TensorFlow - // model, with a given set of hyper-parameters, whose summaries are - // written out to a single event logs directory with a monotonic step - // counter. + // Each Run usually represents a single attempt at training or testing + // a TensorFlow model, with a given set of hyper-parameters, whose + // summaries are written out to a single event logs directory with a + // monotonic step counter. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. + // rowid: Ephemeral b-tree ID. // run_id: The Permanent ID of the Run. This has a 1:1 mapping // with a SummaryWriter instance. 
If two writers spawn for a // given (user_name, run_name, run_name) then each should @@ -199,8 +210,8 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { // previous invocations will then enter limbo, where they may be // accessible for certain operations, but should be garbage // collected eventually. - // experiment_id: Optional ID of associated Experiment. // run_name: User-supplied string, unique across Experiment. + // experiment_id: Optional ID of associated Experiment. // inserted_time: Float UNIX timestamp with µs precision. This is // always the time the row was inserted into the database. It // does not change. @@ -215,40 +226,33 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { // SummaryWriter resource that created this run was destroyed. // Once this value becomes non-NULL a Run and its Tags and // Tensors should be regarded as immutable. - // graph_id: ID of associated Graphs row. s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Runs ( rowid INTEGER PRIMARY KEY, experiment_id INTEGER, run_id INTEGER NOT NULL, - run_name TEXT, inserted_time REAL, started_time REAL, finished_time REAL, - graph_id INTEGER + run_name TEXT ) )sql")); - // Uniquely indexes run_id on Runs table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS RunIdIndex ON Runs (run_id) )sql")); - // Uniquely indexes (experiment_id, run_name) on Runs table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS RunNameIndex ON Runs (experiment_id, run_name) WHERE run_name IS NOT NULL )sql")); - // Creates Experiments table. - // - // This table stores information about experiments, which are sets of - // runs. + // Experiments are groups of Runs. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. + // rowid: Ephemeral b-tree ID. // user_id: Optional ID of associated User. // experiment_id: The Permanent ID of the Experiment. // experiment_name: User-supplied string, unique across User. 
@@ -259,34 +263,39 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { // the MIN(experiment.started_time, run.started_time) of each // Run added to the database, including Runs which have since // been overwritten. + // is_watching: A boolean indicating if someone is actively + // looking at this Experiment in the TensorBoard GUI. Tensor + // writers that do reservoir sampling can query this value to + // decide if they want the "keep last" behavior. This improves + // the performance of long running training while allowing low + // latency feedback in TensorBoard. s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Experiments ( rowid INTEGER PRIMARY KEY, user_id INTEGER, experiment_id INTEGER NOT NULL, - experiment_name TEXT, inserted_time REAL, - started_time REAL + started_time REAL, + is_watching INTEGER, + experiment_name TEXT ) )sql")); - // Uniquely indexes experiment_id on Experiments table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS ExperimentIdIndex ON Experiments (experiment_id) )sql")); - // Uniquely indexes (user_id, experiment_name) on Experiments table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS ExperimentNameIndex ON Experiments (user_id, experiment_name) WHERE experiment_name IS NOT NULL )sql")); - // Creates Users table. + // Users are people who love TensorBoard. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. + // rowid: Ephemeral b-tree ID. // user_id: The Permanent ID of the User. // user_name: Unique user name. // email: Optional unique email address. @@ -297,61 +306,66 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { CREATE TABLE IF NOT EXISTS Users ( rowid INTEGER PRIMARY KEY, user_id INTEGER NOT NULL, + inserted_time REAL, user_name TEXT, - email TEXT, - inserted_time REAL + email TEXT ) )sql")); - // Uniquely indexes user_id on Users table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS UserIdIndex ON Users (user_id) )sql")); - // Uniquely indexes user_name on Users table. 
s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS UserNameIndex ON Users (user_name) WHERE user_name IS NOT NULL )sql")); - // Uniquely indexes email on Users table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS UserEmailIndex ON Users (email) WHERE email IS NOT NULL )sql")); - // Creates Graphs table. + // Graphs define how Tensors flowed in Runs. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. + // rowid: Ephemeral b-tree ID. + // run_id: The Permanent ID of the associated Run. Only one Graph + // can be associated with a Run. // graph_id: The Permanent ID of the Graph. // inserted_time: Float UNIX timestamp with µs precision. This is // always the wall time of when the row was inserted into the // DB. It may be used as a hint for an archival job. - // node_def: Contains Snappy tf.GraphDef proto. All fields will be - // cleared except those not expressed in SQL. + // node_def: Contains tf.GraphDef proto. All fields will be cleared + // except those not expressed in SQL. s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Graphs ( rowid INTEGER PRIMARY KEY, + run_id INTEGER, graph_id INTEGER NOT NULL, inserted_time REAL, graph_def BLOB ) )sql")); - // Uniquely indexes graph_id on Graphs table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS GraphIdIndex ON Graphs (graph_id) )sql")); - // Creates Nodes table. + s.Update(Run(db, R"sql( + CREATE UNIQUE INDEX IF NOT EXISTS GraphRunIndex + ON Graphs (run_id) + WHERE run_id IS NOT NULL + )sql")); + + // Nodes are the vertices in Graphs. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. + // rowid: Ephemeral b-tree ID. // graph_id: The Permanent ID of the associated Graph. // node_id: ID for this node. This is more like a 0-index within // the Graph. Please note indexes are allowed to be removed. @@ -361,8 +375,10 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { // node_def.name proto field must not be cleared. // op: Copied from tf.NodeDef proto. 
// device: Copied from tf.NodeDef proto. - // node_def: Contains Snappy tf.NodeDef proto. All fields will be - // cleared except those not expressed in SQL. + // node_def: Contains tf.NodeDef proto. All fields will be cleared + // except those not expressed in SQL. + // + // TODO(jart): Make separate tables for op and device strings. s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS Nodes ( rowid INTEGER PRIMARY KEY, @@ -375,32 +391,35 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { ) )sql")); - // Uniquely indexes (graph_id, node_id) on Nodes table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS NodeIdIndex ON Nodes (graph_id, node_id) )sql")); - // Uniquely indexes (graph_id, node_name) on Nodes table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS NodeNameIndex ON Nodes (graph_id, node_name) WHERE node_name IS NOT NULL )sql")); - // Creates NodeInputs table. + // NodeInputs are directed edges between Nodes in Graphs. // // Fields: - // rowid: Ephemeral b-tree ID dictating locality. + // rowid: Ephemeral b-tree ID. // graph_id: The Permanent ID of the associated Graph. // node_id: Index of Node in question. This can be considered the // 'to' vertex. // idx: Used for ordering inputs on a given Node. // input_node_id: Nodes.node_id of the corresponding input node. // This can be considered the 'from' vertex. + // input_node_idx: Since a Node can output multiple Tensors, this + // is the integer index of which of those outputs is our input. + // NULL is treated as 0. // is_control: If non-zero, indicates this input is a controlled // dependency, which means this isn't an edge through which // tensors flow. NULL means 0. + // + // TODO(jart): Rename to NodeEdges. 
s.Update(Run(db, R"sql( CREATE TABLE IF NOT EXISTS NodeInputs ( rowid INTEGER PRIMARY KEY, @@ -408,11 +427,11 @@ Status SetupTensorboardSqliteDb(Sqlite* db) { node_id INTEGER NOT NULL, idx INTEGER NOT NULL, input_node_id INTEGER NOT NULL, + input_node_idx INTEGER, is_control INTEGER ) )sql")); - // Uniquely indexes (graph_id, node_id, idx) on NodeInputs table. s.Update(Run(db, R"sql( CREATE UNIQUE INDEX IF NOT EXISTS NodeInputsIndex ON NodeInputs (graph_id, node_id, idx) |