# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Example of Estimator for CNN-based text classification with DBpedia data.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np from sklearn import metrics import pandas import tensorflow as tf from tensorflow.contrib import learn FLAGS = tf.app.flags.FLAGS tf.app.flags.DEFINE_bool('test_with_fake_data', False, 'Test the example code with fake data.') MAX_DOCUMENT_LENGTH = 100 EMBEDDING_SIZE = 20 N_FILTERS = 10 WINDOW_SIZE = 20 FILTER_SHAPE1 = [WINDOW_SIZE, EMBEDDING_SIZE] FILTER_SHAPE2 = [WINDOW_SIZE, N_FILTERS] POOLING_WINDOW = 4 POOLING_STRIDE = 2 n_words = 0 def cnn_model(x, y): """2 layer Convolutional network to predict from sequence of words to a class.""" # Convert indexes of words into embeddings. # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then # maps word indexes of the sequence into [batch_size, sequence_length, # EMBEDDING_SIZE]. y = tf.one_hot(y, 15, 1, 0) word_vectors = learn.ops.categorical_variable(x, n_classes=n_words, embedding_size=EMBEDDING_SIZE, name='words') word_vectors = tf.expand_dims(word_vectors, 3) with tf.variable_scope('CNN_Layer1'): # Apply Convolution filtering on input sequence. conv1 = tf.contrib.layers.convolution2d(word_vectors, N_FILTERS, FILTER_SHAPE1, padding='VALID') # Add a RELU for non linearity. conv1 = tf.nn.relu(conv1) # Max pooling across output of Convolution+Relu. pool1 = tf.nn.max_pool(conv1, ksize=[1, POOLING_WINDOW, 1, 1], strides=[1, POOLING_STRIDE, 1, 1], padding='SAME') # Transpose matrix so that n_filters from convolution becomes width. pool1 = tf.transpose(pool1, [0, 1, 3, 2]) with tf.variable_scope('CNN_Layer2'): # Second level of convolution filtering. conv2 = tf.contrib.layers.convolution2d(pool1, N_FILTERS, FILTER_SHAPE2, padding='VALID') # Max across each filter to get useful features for classification. pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1]) # Apply regular WX + B and classification. prediction, loss = learn.models.logistic_regression(pool2, y) train_op = tf.contrib.layers.optimize_loss( loss, tf.contrib.framework.get_global_step(), optimizer='Adam', learning_rate=0.01) return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op def main(unused_argv): global n_words # Prepare training and testing data dbpedia = learn.datasets.load_dataset( 'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data) x_train = pandas.DataFrame(dbpedia.train.data)[1] y_train = pandas.Series(dbpedia.train.target) x_test = pandas.DataFrame(dbpedia.test.data)[1] y_test = pandas.Series(dbpedia.test.target) # Process vocabulary vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH) x_train = np.array(list(vocab_processor.fit_transform(x_train))) x_test = np.array(list(vocab_processor.transform(x_test))) n_words = len(vocab_processor.vocabulary_) print('Total words: %d' % n_words) # Build model classifier = learn.Estimator(model_fn=cnn_model) # Train and predict classifier.fit(x_train, y_train, steps=100) y_predicted = classifier.predict(x_test) score = metrics.accuracy_score(y_test, y_predicted['class']) print('Accuracy: {0:f}'.format(score)) if __name__ == '__main__': tf.app.run()