diff options
author | 2017-03-21 16:46:23 -0800 | |
---|---|---|
committer | 2017-03-21 18:10:18 -0700 | |
commit | 14dcad8348d74773331d27edfa3a1a88faa91cea (patch) | |
tree | 2a57f3f0848bde39528ff66c4e461eb9138a28ed | |
parent | 0eafa2485ebde630760602e5dac8895eedce27b9 (diff) |
Stream parse tensors and metadata TSV files to support files > 256MB.
Change: 150827900
4 files changed, 226 insertions, 52 deletions
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts b/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts index bf1bc0b255..57e549c2f2 100644 --- a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts +++ b/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts @@ -76,15 +76,19 @@ export class DemoDataProvider implements DataProvider { callback); } else { logging.setModalMessage('Fetching tensors...', TENSORS_MSG_ID); - d3.text(url, (error: any, dataString: string) => { - if (error) { - logging.setErrorMessage(error.responseText, 'fetching tensors'); - return; - } - dataProvider.parseTensors(dataString).then(points => { + const request = new XMLHttpRequest(); + request.open('GET', url); + request.responseType = 'arraybuffer'; + + request.onerror = () => { + logging.setErrorMessage(request.responseText, 'fetching tensors'); + }; + request.onload = () => { + dataProvider.parseTensors(request.response).then(points => { callback(new DataSet(points)); }); - }); + }; + request.send(); } } diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider.ts b/tensorflow/tensorboard/components/vz_projector/data-provider.ts index 4db042d7fd..3acc5a4374 100644 --- a/tensorflow/tensorboard/components/vz_projector/data-provider.ts +++ b/tensorflow/tensorboard/components/vz_projector/data-provider.ts @@ -131,30 +131,87 @@ export function retrieveTensorAsBytes( } export function parseRawTensors( - content: string, callback: (ds: DataSet) => void) { + content: ArrayBuffer, callback: (ds: DataSet) => void) { parseTensors(content).then(data => { callback(new DataSet(data)); }); } export function parseRawMetadata( - contents: string, callback: (r: SpriteAndMetadataInfo) => void) { + contents: ArrayBuffer, callback: (r: SpriteAndMetadataInfo) => void) { parseMetadata(contents).then(result => callback(result)); } +/** + * Parse an ArrayBuffer in a streaming fashion line by line (or custom delim). + * Can handle very large files. + * + * @param content The array buffer. + * @param callback The callback called on each line. + * @param chunkSize The size of each read chunk, defaults to ~1MB. (optional) + * @param delim The delimiter used to split a line, defaults to '\n'. (optional) + * @returns A promise for when it is finished. + */ +function streamParse( + content: ArrayBuffer, callback: (line: string) => void, chunkSize = 1000000, + delim = '\n'): Promise<void> { + return new Promise<void>((resolve, reject) => { + let offset = 0; + let bufferSize = content.byteLength - 1; + let data = ''; + + function readHandler(str) { + offset += chunkSize; + let parts = str.split(delim); + let first = data + parts[0]; + if (parts.length === 1) { + data = first; + readChunk(offset, chunkSize); + return; + } + data = parts[parts.length - 1]; + callback(first); + for (let i = 1; i < parts.length - 1; i++) { + callback(parts[i]); + } + if (offset >= bufferSize) { + if (data) { + callback(data); + } + resolve(); + return; + } + readChunk(offset, chunkSize); + } + + function readChunk(offset: number, size: number) { + const contentChunk = content.slice(offset, offset + size); + + const blob = new Blob([contentChunk]); + const file = new FileReader(); + file.onload = (e: any) => readHandler(e.target.result); + file.readAsText(blob); + } + + readChunk(offset, chunkSize); + }); +} + /** Parses a tsv text file. */ export function parseTensors( - content: string, delim = '\t'): Promise<DataPoint[]> { - let data: DataPoint[] = []; - let numDim: number; - return runAsyncTask('Parsing tensors...', () => { - let lines = content.split('\n'); - lines.forEach(line => { + content: ArrayBuffer, valueDelim = '\t'): Promise<DataPoint[]> { + logging.setModalMessage('Parsing tensors...', TENSORS_MSG_ID); + + return new Promise<DataPoint[]>((resolve, reject) => { + let data: DataPoint[] = []; + let numDim: number; + + streamParse(content, (line: string) => { line = line.trim(); if (line === '') { return; } - let row = line.split(delim); + let row = line.split(valueDelim); let dataPoint: DataPoint = { metadata: {}, vector: null, @@ -182,11 +239,10 @@ export function parseTensors( 'Parsing failed. Found a vector with only one dimension?'); throw Error('Parsing failed'); } + }).then(() => { + logging.setModalMessage(null, TENSORS_MSG_ID); + resolve(data); }); - return data; - }, TENSORS_MSG_ID).then(dataPoints => { - logging.setModalMessage(null, TENSORS_MSG_ID); - return dataPoints; }); } @@ -263,19 +319,33 @@ export function analyzeMetadata( return columnStats; } -export function parseMetadata(content: string): Promise<SpriteAndMetadataInfo> { - return runAsyncTask('Parsing metadata...', () => { - let lines = content.split('\n').filter(line => line.trim().length > 0); - let hasHeader = lines[0].indexOf('\t') >= 0; +export function parseMetadata(content: ArrayBuffer): + Promise<SpriteAndMetadataInfo> { + logging.setModalMessage('Parsing metadata...', METADATA_MSG_ID); + + return new Promise<SpriteAndMetadataInfo>((resolve, reject) => { let pointsMetadata: PointMetadata[] = []; - // If the first row doesn't contain metadata keys, we assume that the values - // are labels. + let hasHeader = false; + let lineNumber = 0; let columnNames = ['label']; - if (hasHeader) { - columnNames = lines[0].split('\t'); - lines = lines.slice(1); - } - lines.forEach((line: string) => { + streamParse(content, (line: string) => { + if (line.trim().length === 0) { + return; + } + if (lineNumber === 0) { + hasHeader = line.indexOf('\t') >= 0; + + // If the first row doesn't contain metadata keys, we assume that the + // values are labels. + if (hasHeader) { + columnNames = line.split('\t'); + lineNumber++; + return; + } + } + + lineNumber++; + let rowValues = line.split('\t'); let metadata: PointMetadata = {}; pointsMetadata.push(metadata); @@ -285,14 +355,13 @@ export function parseMetadata(content: string): Promise<SpriteAndMetadataInfo> { value = (value === '' ? null : value); metadata[name] = value; }); + }).then(() => { + logging.setModalMessage(null, METADATA_MSG_ID); + resolve({ + stats: analyzeMetadata(columnNames, pointsMetadata), + pointsInfo: pointsMetadata + }); }); - return { - stats: analyzeMetadata(columnNames, pointsMetadata), - pointsInfo: pointsMetadata - } as SpriteAndMetadataInfo; - }, METADATA_MSG_ID).then(metadata => { - logging.setModalMessage(null, METADATA_MSG_ID); - return metadata; }); } @@ -313,14 +382,19 @@ export function retrieveSpriteAndMetadataInfo(metadataPath: string, if (metadataPath) { metadataPromise = new Promise<SpriteAndMetadataInfo>((resolve, reject) => { logging.setModalMessage('Fetching metadata...', METADATA_MSG_ID); - d3.text(metadataPath, (err: any, rawMetadata: string) => { - if (err) { - logging.setErrorMessage(err.responseText, 'fetching metadata'); - reject(err); - return; - } - resolve(parseMetadata(rawMetadata)); - }); + + const request = new XMLHttpRequest(); + request.open('GET', metadataPath); + request.responseType = 'arraybuffer'; + + request.onerror = () => { + logging.setErrorMessage(request.responseText, 'fetching metadata'); + reject(); + }; + request.onload = () => { + resolve(parseMetadata(request.response)); + }; + request.send(null); }); } let spriteMsgId = null; diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts b/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts new file mode 100644 index 0000000000..01b89ca700 --- /dev/null +++ b/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts @@ -0,0 +1,96 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +import {DataPoint, SpriteAndMetadataInfo} from './data'; +import * as data_provider from './data-provider'; + +/** + * Converts a string to an ArrayBuffer. + */ +function stringToArrayBuffer(str: string): Promise<ArrayBuffer> { + return new Promise<ArrayBuffer>((resolve, reject) => { + let blob = new Blob([str]); + let file = new FileReader(); + file.onload = (e: any) => { + resolve(e.target.result); + }; + file.readAsArrayBuffer(blob); + }); +} + +/** + * Converts an data array to TSV format. + */ +function dataToTsv(data: string[][]|number[][]) { + let lines = []; + for (let i = 0; i < data.length; i++) { + lines.push(data[i].join('\t')); + } + return lines.join('\n'); +} + +describe('parse tensors', () => { + it('parseTensors', (doneFn) => { + let tensors = [[1.0, 2.0], [2.0, 3.0]]; + stringToArrayBuffer(dataToTsv(tensors)) + .then((tensorsArrayBuffer: ArrayBuffer) => { + data_provider.parseTensors(tensorsArrayBuffer) + .then((data: DataPoint[]) => { + expect(data.length).toBe(2); + + expect(data[0].vector).toEqual(new Float32Array(tensors[0])); + expect(data[0].index).toEqual(0); + expect(data[0].projections).toBeNull(); + + expect(data[1].vector).toEqual(new Float32Array(tensors[1])); + expect(data[1].index).toEqual(1); + expect(data[1].projections).toBeNull(); + doneFn(); + }); + }); + }); + it('parseMetadata', (doneFn) => { + let metadata = [['label', 'fakecol'], ['Г', '0'], ['label1', '1']]; + + stringToArrayBuffer(dataToTsv(metadata)) + .then((metadataArrayBuffer: ArrayBuffer) => { + data_provider.parseMetadata(metadataArrayBuffer) + .then((spriteAndMetadataInfo: SpriteAndMetadataInfo) => { + expect(spriteAndMetadataInfo.stats.length).toBe(2); + expect(spriteAndMetadataInfo.stats[0].name) + .toBe(metadata[0][0]); + expect(spriteAndMetadataInfo.stats[0].isNumeric).toBe(false); + expect(spriteAndMetadataInfo.stats[0].tooManyUniqueValues) + .toBe(false); + expect(spriteAndMetadataInfo.stats[1].name) + .toBe(metadata[0][1]); + expect(spriteAndMetadataInfo.stats[1].isNumeric).toBe(true); + expect(spriteAndMetadataInfo.stats[1].tooManyUniqueValues) + .toBe(false); + + expect(spriteAndMetadataInfo.pointsInfo.length).toBe(2); + expect(spriteAndMetadataInfo.pointsInfo[0]['label']) + .toBe(metadata[1][0]); + expect(spriteAndMetadataInfo.pointsInfo[0]['fakecol']) + .toBe(+metadata[1][1]); + expect(spriteAndMetadataInfo.pointsInfo[1]['label']) + .toBe(metadata[2][0]); + expect(spriteAndMetadataInfo.pointsInfo[1]['fakecol']) + .toBe(+metadata[2][1]); + doneFn(); + }); + }); + }); +}); diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts index b34d0f60ed..a56ea0f71f 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts @@ -320,7 +320,7 @@ export class DataPanel extends DataPanelPolymer { this.projector.setSelectedColorOption(colorOption); } - private tensorWasReadFromFile(rawContents: string, fileName: string) { + private tensorWasReadFromFile(rawContents: ArrayBuffer, fileName: string) { parseRawTensors(rawContents, ds => { this.dom.select('#checkpoint-file') .text(fileName) @@ -329,7 +329,7 @@ export class DataPanel extends DataPanelPolymer { }); } - private metadataWasReadFromFile(rawContents: string, fileName: string) { + private metadataWasReadFromFile(rawContents: ArrayBuffer, fileName: string) { parseRawMetadata(rawContents, metadata => { this.projector.updateDataSet(this.projector.dataSet, metadata, fileName); }); @@ -354,10 +354,10 @@ export class DataPanel extends DataPanelPolymer { (d3.event as any).target.value = ''; let fileReader = new FileReader(); fileReader.onload = evt => { - let content: string = (evt.target as any).result; + let content: ArrayBuffer = (evt.target as any).result; this.tensorWasReadFromFile(content, file.name); }; - fileReader.readAsText(file); + fileReader.readAsArrayBuffer(file); }); let uploadButton = this.dom.select('#upload-tensors'); @@ -374,10 +374,10 @@ export class DataPanel extends DataPanelPolymer { (d3.event as any).target.value = ''; let fileReader = new FileReader(); fileReader.onload = evt => { - let contents: string = (evt.target as any).result; + let contents: ArrayBuffer = (evt.target as any).result; this.metadataWasReadFromFile(contents, file.name); }; - fileReader.readAsText(file); + fileReader.readAsArrayBuffer(file); }); let uploadMetadataButton = this.dom.select('#upload-metadata'); |