# Copyright 2018 Saarland University, Spoken Language
# Systems LSV (author: Youssef Oualil, during his work period at LSV)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS*, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
#
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# Parts of this code are based on the Tensorflow PTB-LM recipe licensed under
# the Apache License, Version 2.0 by the TensorFlow Authors.
# (Source: https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/ptb_word_lm.py
# retrieved in January 2018)
###############################################################################
import time
import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
from tensorflow.contrib import legacy_seq2seq
def data_type():
return tf.float32
class LM(object):
"""
This classe implements the basic RNN-LMs using the built-in Tensorflow cells.
In particular, this calss can be used to train vanilla-RNN, LSTM (with and
witout projection) and GRU.
"""
def __init__(self, config, training=True):
"""
The constructor of the RNN-LM. We define here the complete graph.
"""
        # store the configuration for later use
self.config = config
# define the particular attributes of the basic RNN models
self.training = training
# bottleneck layer activation function
if config.activation == 'tanh':
self.activation = tf.nn.tanh
elif config.activation == 'sigmoid':
self.activation = tf.nn.sigmoid
elif config.activation == 'relu':
self.activation = tf.nn.relu
elif config.activation == 'elu':
self.activation = tf.nn.elu
elif config.activation == 'relu6':
self.activation = tf.nn.relu6
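        else:
            # fallback guard (an addition for robustness): fail early on an unrecognised
            # activation name instead of leaving self.activation undefined and crashing
            # later when the optional bottleneck layer is built
            raise ValueError("activation not supported: {}".format(config.activation))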
self.history_size = 1
self.model = config.model
self.init_method = config.init_method
self.num_layers = config.num_layers
self.input_keep_prob = config.input_keep_prob
self.output_keep_prob = config.output_keep_prob
self.vocab_size = config.vocab_size
self.use_peepholes = config.use_peepholes
        # check consistency of the LSTM parameters
if config.model == "lstm" or config.model == "lstmp":
if config.use_peepholes is True or config.lstmp_proj_size:
self.model = "lstmp"
else:
self.model = "lstm"
if self.model == "lstmp" and config.bottleneck_size:
print("[WARNING] you are using a bottleneck layer on the the top of an LSTMP model, "
"which includes an internal bottleneck (projection) layer...!")
self.embed_size = config.embed_size
# hidden size (layer): internal to the models (e.g., memory in LSTM).
self.hidden_state_size = config.hidden_size
# recurrent layer: layer that feeds back in time into the model.
self.recurrent_state_size = config.hidden_size
# last layer: layer right before the output layer (can be bottleneck or recurrent layer).
self.last_layer_size = config.hidden_size
if config.bottleneck_size:
self.last_layer_size = config.bottleneck_size
if self.model == "lstmp" and config.lstmp_proj_size:
self.recurrent_state_size = config.lstmp_proj_size
self.last_layer_size = config.lstmp_proj_size
if config.bottleneck_size:
self.last_layer_size = config.bottleneck_size
# ##############################################################
# ############# DEFINE THE PLACEHOLDERS ##############
        # placeholders for the training input data and target words
self.input_data = tf.placeholder(
tf.int32, [config.batch_size, config.seq_length])
self.targets = tf.placeholder(
tf.int32, [config.batch_size, config.seq_length])
# ##############################################################
# ###### DEFINE TRAINABLE VARIABLES (WEIGHTS AND BIASES) #####
# define the initializer of embeddings, weights and biases
if self.init_method == "xavier":
initializer = tf.contrib.layers.xavier_initializer(uniform=True)
else:
initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
# word embeddings
with tf.variable_scope("input_layer"):
self.embedding = tf.get_variable("embedding", [self.vocab_size, self.embed_size],
initializer=initializer)
# weights and biases of the bottleneck layer (if used)
if config.bottleneck_size:
with tf.variable_scope("bottleneck_layer"):
self.bottleneck_w = tf.get_variable("bottleneck_w",
[self.recurrent_state_size, config.bottleneck_size],
initializer=initializer)
self.bottleneck_b = tf.get_variable("bottleneck_b", [config.bottleneck_size],
initializer=initializer)
# weights and biases of the hidden-to-output layer
with tf.variable_scope("output_layer"):
self.output_w = tf.get_variable("output_w",
[self.last_layer_size, self.vocab_size],
initializer=initializer)
self.output_b = tf.get_variable("output_b", [self.vocab_size],
initializer=initializer)
# ##############################################################
# ######### BUILD THE LM NETWORK GRAPH ##########
        # extract the embedding of each word input in the batch
inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)
# apply dropout to the input if needed.
if self.training and self.input_keep_prob < 1:
inputs = tf.nn.dropout(inputs, self.input_keep_prob)
        # rearrange our input shape to create the training sequence:
        # we create a sequence made of the vertical slices of the input
inputs = tf.split(inputs, config.seq_length, 1)
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
        # build the (multi-layer) recurrent cell
self.cell = self.build_recurrence_cell(config)
# initialize the hidden (recurrent) state to zero
self.initial_state = self.cell.zero_state(config.batch_size, tf.float32)
# build the LM and update the hidden state
rec_state, self.final_state = self.time_sequence_graph(inputs)
if config.bottleneck_size:
last_layer = self.activation(tf.nn.xw_plus_b(rec_state, self.bottleneck_w, self.bottleneck_b))
else:
last_layer = rec_state
# self.logits = tf.matmul(output, self.output_w) + self.output_b
# self.probs = tf.nn.softmax(self.logits)
logits = tf.nn.xw_plus_b(last_layer, self.output_w, self.output_b)
# reshape logits to be a 3-D tensor for sequence loss
self.logits = tf.reshape(logits, [config.batch_size, config.seq_length, self.vocab_size])
loss = tf.contrib.seq2seq.sequence_loss(
self.logits,
self.targets,
tf.ones([config.batch_size, config.seq_length], dtype=data_type()),
average_across_timesteps=False,
average_across_batch=True)
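        # Note: with average_across_timesteps=False and average_across_batch=True,
        # sequence_loss returns one batch-averaged cross-entropy value per time step
        # (shape [seq_length]); summing over time below yields the total loss per sequence,
        # which run_model() turns into a perplexity via exp(total_cost / num_timesteps).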
with tf.name_scope('cost'):
self.cost = tf.reduce_sum(loss)
###################################################
        # If we are in the training stage, compute and clip the gradients, then update
        # the weights, biases and word embeddings
if self.training:
self.lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
# clip the gradient by norm
grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), config.grad_clip)
# update variables (weights, biases, embeddings...)
with tf.name_scope('optimizer'):
# optimizer = tf.train.AdamOptimizer(self.lr)
optimizer = tf.train.GradientDescentOptimizer(self.lr)
self.train_op = optimizer.apply_gradients(zip(grads, tvars),
global_step=tf.contrib.framework.get_or_create_global_step())
def build_recurrence_cell(self, config):
"""
Build and return the recurrent cell that will be used by our LM.
This class uses only the built-in Tensorflow
"""
        # if needed, the activation function used by the basic model can be changed as well
activation_ = tf.nn.tanh
if self.model == 'rnn':
_cell_ = rnn.BasicRNNCell
elif self.model == 'gru':
_cell_ = rnn.GRUCell
elif self.model == "lstmp":
_cell_ = rnn.LSTMCell
elif self.model == "lstm":
_cell_ = rnn.BasicLSTMCell
else:
raise Exception("model type not supported: {}".format(self.model))
cells = []
# Apply dropout if required
for _ in range(self.num_layers):
if self.model == "lstmp": # you can change the activation function of the project layer
cell = _cell_(self.hidden_state_size, use_peepholes=self.use_peepholes, num_proj=config.lstmp_proj_size)
else:
cell = _cell_(self.hidden_state_size, activation=activation_)
if self.training and self.output_keep_prob < 1.0:
cell = rnn.DropoutWrapper(cell, output_keep_prob=self.output_keep_prob)
cells.append(cell)
# build and return the TF multi-recurrent cell graph
return rnn.MultiRNNCell(cells, state_is_tuple=True)
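    # Note on the recurrent state: with state_is_tuple=True, the MultiRNNCell state
    # (self.initial_state, and the last_state returned by time_sequence_graph below) is a
    # tuple with one entry per layer: an LSTMStateTuple(c, h) for "lstm"/"lstmp" cells, or a
    # plain hidden-state tensor for "rnn"/"gru". This is the structure that run_model()
    # feeds back between consecutive batches.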
def time_sequence_graph(self, inputs):
"""
Apply the recurrence cell to an input sequence (each batch entry is a sequence of words).
return: stacked cell outputs of the complete sequence in addition to the last hidden state
(and memory for LSTM/LSTMP) obtained after processing the last word (in each batch entry).
"""
outputs, last_state = legacy_seq2seq.rnn_decoder(inputs, self.initial_state, self.cell, loop_function=None)
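        # Note: `outputs` is a list of seq_length tensors of shape
        # [batch_size, recurrent_state_size]; the concat/reshape below stacks them into a
        # [batch_size * seq_length, recurrent_state_size] matrix so that a single matmul
        # against the (bottleneck and) output weights covers the whole batch.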
output = tf.reshape(tf.concat(outputs, 1), [-1, self.recurrent_state_size])
return output, last_state
def run_model(self, session, data, eval_op=None, verbosity=10000, verbose=False):
"""
Train or test the current model on some given data.
This basically trains/applies the model on some data
loaded by the data processor.
This will help training on a large corpus by splitting
them into smaller chunks and processing them one by one.
"""
data.reset_batch_pointer()
start_time = time.time()
costs = 0.0
iters = 0
state = session.run(self.initial_state)
fetches = {
"cost": self.cost,
"final_state": self.final_state,
}
if eval_op is not None:
fetches["eval_op"] = eval_op
print_tresh = 0
for step in range(data.num_batches):
indata, target = data.next_batch()
feed_dict = {self.initial_state: state, self.input_data: indata, self.targets: target}
vals = session.run(fetches, feed_dict)
cost = vals["cost"]
state = vals["final_state"]
costs += cost
iters += data.seq_length
total_proc_words = float((iters-1)*data.batch_size)
if verbose and (step == 0 or total_proc_words > print_tresh or step == data.num_batches-1):
print("[INFO] Progress: {:.2f}% | "
"Perplexity: {:.3f} | "
"Total Words: {:.1f}K | "
"Speed: {:.1f}K word/second"
.format((step+1) / data.num_batches * 100, np.exp(costs/iters),
total_proc_words / 1000,
total_proc_words / (1000 * (time.time() - start_time))))
print_tresh += verbosity
return np.exp(costs / iters)
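

# -----------------------------------------------------------------------------
# Illustrative usage sketch: shows how the LM class above can be wired up,
# assuming a plain namespace object that provides the configuration attributes
# read in __init__. The attribute values below are arbitrary examples, and the
# `data` object expected by run_model() (with reset_batch_pointer / next_batch /
# num_batches / seq_length / batch_size) comes from the toolkit's data processor
# and is therefore only hinted at here.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    example_config = SimpleNamespace(
        model="lstm",              # "rnn", "gru", "lstm" or "lstmp"
        activation="tanh",         # activation of the optional bottleneck layer
        init_method="xavier",      # anything else falls back to uniform init
        init_scale=0.1,
        num_layers=1,
        hidden_size=200,
        embed_size=200,
        bottleneck_size=0,         # 0 disables the bottleneck layer
        lstmp_proj_size=0,         # 0 disables the LSTMP projection
        use_peepholes=False,
        vocab_size=10000,
        batch_size=20,
        seq_length=35,
        input_keep_prob=1.0,
        output_keep_prob=1.0,
        grad_clip=5.0,
    )

    with tf.Graph().as_default():
        model = LM(example_config, training=True)
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            # model.lr is a non-trainable variable initialised to 0.0, so it must be
            # assigned explicitly before training, e.g.:
            session.run(tf.assign(model.lr, 1.0))
            # train_ppl = model.run_model(session, data, eval_op=model.train_op, verbose=True)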