# encoding: utf-8
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Note: this example targets Python 2 and the pre-1.0 tf.contrib.learn API.
import cPickle
import itertools
import os
import random

import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

# Get training data.
# This dataset can be downloaded from http://www.statmt.org/europarl/v6/fr-en.tgz
ENGLISH_CORPUS = "europarl-v6.fr-en.en"
FRENCH_CORPUS = "europarl-v6.fr-en.fr"


def read_iterator(filename, reporting=True):
  """Yields stripped lines from the file, reporting progress every 100k lines."""
  f = open(filename)
  line_count = 0
  for line in f:
    line_count += 1
    if reporting and line_count % 100000 == 0:
      print("%d lines read from %s" % (line_count, filename))
    yield line.strip()


def repeated_read_iterator(filename):
  """Cycles over the file endlessly, yielding stripped lines."""
  while True:
    f = open(filename)
    for line in f:
      yield line.strip()


def split_train_test(data, partition=0.2, random_seed=42):
  """Tags each item with partition 0 (train) or 1 (test); ~`partition` goes to test."""
  rnd = np.random.RandomState(random_seed)
  for item in data:
    if rnd.uniform() > partition:
      yield (0, item)
    else:
      yield (1, item)


def save_partitions(data, filenames):
  """Writes each (partition, item) pair to the file for that partition."""
  files = [open(filename, 'w') for filename in filenames]
  for partition, item in data:
    files[partition].write(item + '\n')
  for f in files:
    f.close()


def loop_iterator(data):
  while True:
    for item in data:
      yield item


if not (os.path.exists('train.data') and os.path.exists('test.data')):
  english_data = read_iterator(ENGLISH_CORPUS)
  french_data = read_iterator(FRENCH_CORPUS)
  # Join aligned sentence pairs with ';;;' so each line holds one pair.
  parallel_data = ('%s;;;%s' % (eng, fr)
                   for eng, fr in itertools.izip(english_data, french_data))
  save_partitions(split_train_test(parallel_data), ['train.data', 'test.data'])


def Xy(data):
  """Splits ';;;'-joined lines back into parallel English/French streams."""
  def split_lines(data):
    for item in data:
      yield item.split(';;;')
  X, y = itertools.tee(split_lines(data))
  return (item[0] for item in X), (item[1] for item in y)


X_train, y_train = Xy(repeated_read_iterator('train.data'))
X_test, y_test = Xy(read_iterator('test.data'))

# Preprocessing.
MAX_DOCUMENT_LENGTH = 10

if not (os.path.exists('en.vocab') and os.path.exists('fr.vocab')):
  X_vocab_processor = learn.preprocessing.VocabularyProcessor(
      MAX_DOCUMENT_LENGTH, min_frequency=5)
  y_vocab_processor = learn.preprocessing.VocabularyProcessor(
      MAX_DOCUMENT_LENGTH, min_frequency=5)
  Xtrainff, ytrainff = Xy(read_iterator('train.data'))
  print('Fitting dictionary for English...')
  X_vocab_processor.fit(Xtrainff)
  print('Fitting dictionary for French...')
  y_vocab_processor.fit(ytrainff)
  with open('en.vocab', 'wb') as f:
    f.write(cPickle.dumps(X_vocab_processor))
  with open('fr.vocab', 'wb') as f:
    f.write(cPickle.dumps(y_vocab_processor))
else:
  with open('en.vocab', 'rb') as f:
    X_vocab_processor = cPickle.loads(f.read())
  with open('fr.vocab', 'rb') as f:
    y_vocab_processor = cPickle.loads(f.read())

print('Transforming...')
X_train = X_vocab_processor.transform(X_train)
y_train = y_vocab_processor.transform(y_train)
X_test = X_vocab_processor.transform(X_test)
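# Illustration (added sketch, not part of the original pipeline): `transform`
# yields one fixed-length id array per document, padding or truncating each
# sentence to MAX_DOCUMENT_LENGTH tokens; words pruned by `min_frequency`
# map to id 0 (<UNK>). The sentence below is an arbitrary example, not drawn
# from the corpus.
example_ids = next(X_vocab_processor.transform(['the quick brown fox']))
print('Example token ids (length %d): %s' % (MAX_DOCUMENT_LENGTH, example_ids))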
# TODO: Expand this to use the whole test set.
X_test = np.array([next(X_test) for _ in range(1000)])
y_test = [next(y_test) for _ in range(1000)]

n_en_words = len(X_vocab_processor.vocabulary_)
n_fr_words = len(y_vocab_processor.vocabulary_)
print('Total words, en: %d, fr: %d' % (n_en_words, n_fr_words))

# Translation model.
HIDDEN_SIZE = 20
EMBEDDING_SIZE = 20


def translate_model(X, y):
  """GRU encoder-decoder: embeds English word ids and decodes French word ids."""
  word_vectors = learn.ops.categorical_variable(
      X, n_classes=n_en_words, embedding_size=EMBEDDING_SIZE, name='words')
  in_X, in_y, out_y = learn.ops.seq2seq_inputs(
      word_vectors, y, MAX_DOCUMENT_LENGTH, MAX_DOCUMENT_LENGTH)
  encoder_cell = tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE)
  # Project decoder outputs to the French vocabulary size.
  decoder_cell = tf.nn.rnn_cell.OutputProjectionWrapper(
      tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE), n_fr_words)
  decoding, _, sampling_decoding, _ = learn.ops.rnn_seq2seq(
      in_X, in_y, encoder_cell, decoder_cell=decoder_cell)
  return learn.ops.sequence_classifier(decoding, out_y, sampling_decoding)


PATH = '/tmp/tf_examples/ntm_words/'

if os.path.exists(os.path.join(PATH, 'graph.pbtxt')):
  translator = learn.TensorFlowEstimator.restore(PATH)
else:
  translator = learn.TensorFlowEstimator(
      model_fn=translate_model, n_classes=n_fr_words,
      optimizer='Adam', learning_rate=0.01, batch_size=128,
      continue_training=True, steps=100)

# Train in 100-step increments, checkpointing and printing sample
# translations after each round.
while True:
  translator.fit(X_train, y_train, logdir=PATH)
  translator.save(PATH)

  xpred, ygold = [], []
  for _ in range(10):
    idx = random.randint(0, len(X_test) - 1)
    xpred.append(X_test[idx])
    ygold.append(y_test[idx])
  xpred = np.array(xpred)
  predictions = translator.predict(xpred, axis=2)
  xpred_inp = X_vocab_processor.reverse(xpred)
  text_outputs = y_vocab_processor.reverse(predictions)
  for inp_data, input_text, pred, output_text, gold in zip(
      xpred, xpred_inp, predictions, text_outputs, ygold):
    print('English: %s. French (pred): %s, French (gold): %s' %
          (input_text, output_text, gold))
    print(inp_data, pred)
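# A minimal usage sketch (hypothetical helper, not in the original example):
# translating a single English sentence with the fitted processors and
# estimator. The training loop above runs until interrupted, so to use this,
# lift it above the loop or restore the saved estimator in another session.
def translate_sentence(sentence):
  """Returns the predicted French text for one English sentence."""
  ids = np.array(list(X_vocab_processor.transform([sentence])))
  predicted = translator.predict(ids, axis=2)
  return list(y_vocab_processor.reverse(predicted))[0]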