author     Nikhil Thorat <nsthorat@google.com>            2017-03-21 16:46:23 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org> 2017-03-21 18:10:18 -0700
commit  14dcad8348d74773331d27edfa3a1a88faa91cea (patch)
tree    2a57f3f0848bde39528ff66c4e461eb9138a28ed
parent  0eafa2485ebde630760602e5dac8895eedce27b9 (diff)
Stream parse tensors and metadata TSV files to support files > 256MB.
Change: 150827900
-rw-r--r--  tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts        18
-rw-r--r--  tensorflow/tensorboard/components/vz_projector/data-provider.ts            152
-rw-r--r--  tensorflow/tensorboard/components/vz_projector/data-provider_test.ts        96
-rw-r--r--  tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts   12
4 files changed, 226 insertions, 52 deletions
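
Why streaming: JavaScript engines cap the length of a single string (the commit
title cites 256MB), so loading an entire tensors or metadata TSV into one string
via d3.text() fails for large checkpoints. This patch fetches the raw bytes as
an ArrayBuffer and decodes them chunk by chunk, carrying any partial trailing
line over into the next chunk. A minimal standalone sketch of that carry-over
invariant, using in-memory string chunks; splitChunks is an illustrative name,
not a function in the patch:

    // Each chunk may end mid-line; the unterminated tail is carried into
    // the next chunk rather than emitted early.
    function splitChunks(chunks: string[], emit: (line: string) => void) {
      let carry = '';
      for (const chunk of chunks) {
        const parts = (carry + chunk).split('\n');
        carry = parts.pop()!;  // the last part may be an incomplete line
        parts.forEach(emit);
      }
      if (carry) {
        emit(carry);  // flush the final line if input lacks a trailing '\n'
      }
    }

    // splitChunks(['a\nb', 'c\nd'], console.log) prints: a, bc, d.
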
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts b/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts
index bf1bc0b255..57e549c2f2 100644
--- a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts
+++ b/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts
@@ -76,15 +76,19 @@ export class DemoDataProvider implements DataProvider {
callback);
} else {
logging.setModalMessage('Fetching tensors...', TENSORS_MSG_ID);
- d3.text(url, (error: any, dataString: string) => {
- if (error) {
- logging.setErrorMessage(error.responseText, 'fetching tensors');
- return;
- }
- dataProvider.parseTensors(dataString).then(points => {
+ const request = new XMLHttpRequest();
+ request.open('GET', url);
+ request.responseType = 'arraybuffer';
+
+ request.onerror = () => {
+ logging.setErrorMessage(request.responseText, 'fetching tensors');
+ };
+ request.onload = () => {
+ dataProvider.parseTensors(request.response).then(points => {
callback(new DataSet(points));
});
- });
+ };
+ request.send();
}
}
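
The d3.text() call above materialized the whole response as one string; the
replacement requests the payload as an ArrayBuffer so no single string ever
holds the full file. A promise-based sketch of the same XHR pattern; the
helper name fetchArrayBuffer is illustrative, not from the patch:

    function fetchArrayBuffer(url: string): Promise<ArrayBuffer> {
      return new Promise<ArrayBuffer>((resolve, reject) => {
        const request = new XMLHttpRequest();
        request.open('GET', url);
        // 'arraybuffer' keeps the payload as raw bytes instead of a string.
        request.responseType = 'arraybuffer';
        request.onload = () => resolve(request.response as ArrayBuffer);
        request.onerror = () => reject(new Error('failed to fetch ' + url));
        request.send();
      });
    }
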
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider.ts b/tensorflow/tensorboard/components/vz_projector/data-provider.ts
index 4db042d7fd..3acc5a4374 100644
--- a/tensorflow/tensorboard/components/vz_projector/data-provider.ts
+++ b/tensorflow/tensorboard/components/vz_projector/data-provider.ts
@@ -131,30 +131,87 @@ export function retrieveTensorAsBytes(
}
export function parseRawTensors(
- content: string, callback: (ds: DataSet) => void) {
+ content: ArrayBuffer, callback: (ds: DataSet) => void) {
parseTensors(content).then(data => {
callback(new DataSet(data));
});
}
export function parseRawMetadata(
- contents: string, callback: (r: SpriteAndMetadataInfo) => void) {
+ contents: ArrayBuffer, callback: (r: SpriteAndMetadataInfo) => void) {
parseMetadata(contents).then(result => callback(result));
}
+/**
+ * Parse an ArrayBuffer in a streaming fashion line by line (or custom delim).
+ * Can handle very large files.
+ *
+ * @param content The array buffer.
+ * @param callback The callback called on each line.
+ * @param chunkSize The size of each read chunk, defaults to ~1MB. (optional)
+ * @param delim The delimiter used to split a line, defaults to '\n'. (optional)
+ * @returns A promise for when it is finished.
+ */
+function streamParse(
+ content: ArrayBuffer, callback: (line: string) => void, chunkSize = 1000000,
+ delim = '\n'): Promise<void> {
+ return new Promise<void>((resolve, reject) => {
+ let offset = 0;
+ let bufferSize = content.byteLength - 1;
+ let data = '';
+
+    function readHandler(str: string) {
+ offset += chunkSize;
+ let parts = str.split(delim);
+ let first = data + parts[0];
+ if (parts.length === 1) {
+ data = first;
+ readChunk(offset, chunkSize);
+ return;
+ }
+ data = parts[parts.length - 1];
+ callback(first);
+ for (let i = 1; i < parts.length - 1; i++) {
+ callback(parts[i]);
+ }
+ if (offset >= bufferSize) {
+ if (data) {
+ callback(data);
+ }
+ resolve();
+ return;
+ }
+ readChunk(offset, chunkSize);
+ }
+
+ function readChunk(offset: number, size: number) {
+ const contentChunk = content.slice(offset, offset + size);
+
+ const blob = new Blob([contentChunk]);
+ const file = new FileReader();
+ file.onload = (e: any) => readHandler(e.target.result);
+ file.readAsText(blob);
+ }
+
+ readChunk(offset, chunkSize);
+ });
+}
+
/** Parses a tsv text file. */
export function parseTensors(
- content: string, delim = '\t'): Promise<DataPoint[]> {
- let data: DataPoint[] = [];
- let numDim: number;
- return runAsyncTask('Parsing tensors...', () => {
- let lines = content.split('\n');
- lines.forEach(line => {
+ content: ArrayBuffer, valueDelim = '\t'): Promise<DataPoint[]> {
+ logging.setModalMessage('Parsing tensors...', TENSORS_MSG_ID);
+
+ return new Promise<DataPoint[]>((resolve, reject) => {
+ let data: DataPoint[] = [];
+ let numDim: number;
+
+ streamParse(content, (line: string) => {
line = line.trim();
if (line === '') {
return;
}
- let row = line.split(delim);
+ let row = line.split(valueDelim);
let dataPoint: DataPoint = {
metadata: {},
vector: null,
@@ -182,11 +239,10 @@ export function parseTensors(
'Parsing failed. Found a vector with only one dimension?');
throw Error('Parsing failed');
}
+ }).then(() => {
+ logging.setModalMessage(null, TENSORS_MSG_ID);
+ resolve(data);
});
- return data;
- }, TENSORS_MSG_ID).then(dataPoints => {
- logging.setModalMessage(null, TENSORS_MSG_ID);
- return dataPoints;
});
}
@@ -263,19 +319,33 @@ export function analyzeMetadata(
return columnStats;
}
-export function parseMetadata(content: string): Promise<SpriteAndMetadataInfo> {
- return runAsyncTask('Parsing metadata...', () => {
- let lines = content.split('\n').filter(line => line.trim().length > 0);
- let hasHeader = lines[0].indexOf('\t') >= 0;
+export function parseMetadata(content: ArrayBuffer):
+ Promise<SpriteAndMetadataInfo> {
+ logging.setModalMessage('Parsing metadata...', METADATA_MSG_ID);
+
+ return new Promise<SpriteAndMetadataInfo>((resolve, reject) => {
let pointsMetadata: PointMetadata[] = [];
- // If the first row doesn't contain metadata keys, we assume that the values
- // are labels.
+ let hasHeader = false;
+ let lineNumber = 0;
let columnNames = ['label'];
- if (hasHeader) {
- columnNames = lines[0].split('\t');
- lines = lines.slice(1);
- }
- lines.forEach((line: string) => {
+ streamParse(content, (line: string) => {
+ if (line.trim().length === 0) {
+ return;
+ }
+ if (lineNumber === 0) {
+ hasHeader = line.indexOf('\t') >= 0;
+
+ // If the first row doesn't contain metadata keys, we assume that the
+ // values are labels.
+ if (hasHeader) {
+ columnNames = line.split('\t');
+ lineNumber++;
+ return;
+ }
+ }
+
+ lineNumber++;
+
let rowValues = line.split('\t');
let metadata: PointMetadata = {};
pointsMetadata.push(metadata);
@@ -285,14 +355,13 @@ export function parseMetadata(content: string): Promise<SpriteAndMetadataInfo> {
value = (value === '' ? null : value);
metadata[name] = value;
});
+ }).then(() => {
+ logging.setModalMessage(null, METADATA_MSG_ID);
+ resolve({
+ stats: analyzeMetadata(columnNames, pointsMetadata),
+ pointsInfo: pointsMetadata
+ });
});
- return {
- stats: analyzeMetadata(columnNames, pointsMetadata),
- pointsInfo: pointsMetadata
- } as SpriteAndMetadataInfo;
- }, METADATA_MSG_ID).then(metadata => {
- logging.setModalMessage(null, METADATA_MSG_ID);
- return metadata;
});
}
@@ -313,14 +382,19 @@ export function retrieveSpriteAndMetadataInfo(metadataPath: string,
if (metadataPath) {
metadataPromise = new Promise<SpriteAndMetadataInfo>((resolve, reject) => {
logging.setModalMessage('Fetching metadata...', METADATA_MSG_ID);
- d3.text(metadataPath, (err: any, rawMetadata: string) => {
- if (err) {
- logging.setErrorMessage(err.responseText, 'fetching metadata');
- reject(err);
- return;
- }
- resolve(parseMetadata(rawMetadata));
- });
+
+ const request = new XMLHttpRequest();
+ request.open('GET', metadataPath);
+ request.responseType = 'arraybuffer';
+
+ request.onerror = () => {
+ logging.setErrorMessage(request.responseText, 'fetching metadata');
+ reject();
+ };
+ request.onload = () => {
+ resolve(parseMetadata(request.response));
+ };
+ request.send(null);
});
}
let spriteMsgId = null;
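
parseMetadata above decides on the first non-empty line whether the file has a
header: a tab in that line means it carries column names; otherwise every value
is treated as a label. A hypothetical standalone form of that heuristic
(detectColumnNames is not a function in the patch):

    function detectColumnNames(firstLine: string): string[] {
      return firstLine.indexOf('\t') >= 0 ? firstLine.split('\t') : ['label'];
    }

    // detectColumnNames('label\tfakecol') -> ['label', 'fakecol']
    // detectColumnNames('cat')            -> ['label']
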
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts b/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts
new file mode 100644
index 0000000000..01b89ca700
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts
@@ -0,0 +1,96 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import {DataPoint, SpriteAndMetadataInfo} from './data';
+import * as data_provider from './data-provider';
+
+/**
+ * Converts a string to an ArrayBuffer.
+ */
+function stringToArrayBuffer(str: string): Promise<ArrayBuffer> {
+ return new Promise<ArrayBuffer>((resolve, reject) => {
+ let blob = new Blob([str]);
+ let file = new FileReader();
+ file.onload = (e: any) => {
+ resolve(e.target.result);
+ };
+ file.readAsArrayBuffer(blob);
+ });
+}
+
+/**
+ * Converts a data array to TSV format.
+ */
+function dataToTsv(data: string[][]|number[][]) {
+ let lines = [];
+ for (let i = 0; i < data.length; i++) {
+ lines.push(data[i].join('\t'));
+ }
+ return lines.join('\n');
+}
+
+describe('parse tensors', () => {
+ it('parseTensors', (doneFn) => {
+ let tensors = [[1.0, 2.0], [2.0, 3.0]];
+ stringToArrayBuffer(dataToTsv(tensors))
+ .then((tensorsArrayBuffer: ArrayBuffer) => {
+ data_provider.parseTensors(tensorsArrayBuffer)
+ .then((data: DataPoint[]) => {
+ expect(data.length).toBe(2);
+
+ expect(data[0].vector).toEqual(new Float32Array(tensors[0]));
+ expect(data[0].index).toEqual(0);
+ expect(data[0].projections).toBeNull();
+
+ expect(data[1].vector).toEqual(new Float32Array(tensors[1]));
+ expect(data[1].index).toEqual(1);
+ expect(data[1].projections).toBeNull();
+ doneFn();
+ });
+ });
+ });
+ it('parseMetadata', (doneFn) => {
+ let metadata = [['label', 'fakecol'], ['Г', '0'], ['label1', '1']];
+
+ stringToArrayBuffer(dataToTsv(metadata))
+ .then((metadataArrayBuffer: ArrayBuffer) => {
+ data_provider.parseMetadata(metadataArrayBuffer)
+ .then((spriteAndMetadataInfo: SpriteAndMetadataInfo) => {
+ expect(spriteAndMetadataInfo.stats.length).toBe(2);
+ expect(spriteAndMetadataInfo.stats[0].name)
+ .toBe(metadata[0][0]);
+ expect(spriteAndMetadataInfo.stats[0].isNumeric).toBe(false);
+ expect(spriteAndMetadataInfo.stats[0].tooManyUniqueValues)
+ .toBe(false);
+ expect(spriteAndMetadataInfo.stats[1].name)
+ .toBe(metadata[0][1]);
+ expect(spriteAndMetadataInfo.stats[1].isNumeric).toBe(true);
+ expect(spriteAndMetadataInfo.stats[1].tooManyUniqueValues)
+ .toBe(false);
+
+ expect(spriteAndMetadataInfo.pointsInfo.length).toBe(2);
+ expect(spriteAndMetadataInfo.pointsInfo[0]['label'])
+ .toBe(metadata[1][0]);
+ expect(spriteAndMetadataInfo.pointsInfo[0]['fakecol'])
+ .toBe(+metadata[1][1]);
+ expect(spriteAndMetadataInfo.pointsInfo[1]['label'])
+ .toBe(metadata[2][0]);
+ expect(spriteAndMetadataInfo.pointsInfo[1]['fakecol'])
+ .toBe(+metadata[2][1]);
+ doneFn();
+ });
+ });
+ });
+});
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts
index b34d0f60ed..a56ea0f71f 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts
+++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts
@@ -320,7 +320,7 @@ export class DataPanel extends DataPanelPolymer {
this.projector.setSelectedColorOption(colorOption);
}
- private tensorWasReadFromFile(rawContents: string, fileName: string) {
+ private tensorWasReadFromFile(rawContents: ArrayBuffer, fileName: string) {
parseRawTensors(rawContents, ds => {
this.dom.select('#checkpoint-file')
.text(fileName)
@@ -329,7 +329,7 @@ export class DataPanel extends DataPanelPolymer {
});
}
- private metadataWasReadFromFile(rawContents: string, fileName: string) {
+ private metadataWasReadFromFile(rawContents: ArrayBuffer, fileName: string) {
parseRawMetadata(rawContents, metadata => {
this.projector.updateDataSet(this.projector.dataSet, metadata, fileName);
});
@@ -354,10 +354,10 @@ export class DataPanel extends DataPanelPolymer {
(d3.event as any).target.value = '';
let fileReader = new FileReader();
fileReader.onload = evt => {
- let content: string = (evt.target as any).result;
+ let content: ArrayBuffer = (evt.target as any).result;
this.tensorWasReadFromFile(content, file.name);
};
- fileReader.readAsText(file);
+ fileReader.readAsArrayBuffer(file);
});
let uploadButton = this.dom.select('#upload-tensors');
@@ -374,10 +374,10 @@ export class DataPanel extends DataPanelPolymer {
(d3.event as any).target.value = '';
let fileReader = new FileReader();
fileReader.onload = evt => {
- let contents: string = (evt.target as any).result;
+ let contents: ArrayBuffer = (evt.target as any).result;
this.metadataWasReadFromFile(contents, file.name);
};
- fileReader.readAsText(file);
+ fileReader.readAsArrayBuffer(file);
});
let uploadMetadataButton = this.dom.select('#upload-metadata');
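
The panel changes mirror the provider: FileReader.readAsText() becomes
readAsArrayBuffer() so user-uploaded files also bypass the string-length cap.
A small promise wrapper around that pattern; readFileAsArrayBuffer is an
illustrative name, not part of the patch:

    function readFileAsArrayBuffer(file: File): Promise<ArrayBuffer> {
      return new Promise<ArrayBuffer>((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = () => resolve(reader.result as ArrayBuffer);
        reader.onerror = () => reject(reader.error);
        reader.readAsArrayBuffer(file);  // yields raw bytes, not a string
      });
    }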