lsrc.py
# Copyright 2018 Saarland University, Spoken Language
# Systems LSV (author: Youssef Oualil, during his work period at LSV)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS*, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
#
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# Parts of this code are based on the TensorFlow PTB-LM recipe licensed under
# the Apache License, Version 2.0 by the TensorFlow Authors.
# (Source: https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/ptb_word_lm.py
# retrieved in January 2018)
###############################################################################
import time

import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
from tensorflow.contrib import legacy_seq2seq

from utils import xewy_plus_z
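# NOTE (editorial): `xewy_plus_z` comes from the project-local `utils` module,
# which is not shown here. Judging from its call site in
# lsrc_local_sequence_graph(), it presumably computes the element-wise local
# recurrence of the LSRC model, i.e. roughly
#
#     def xewy_plus_z(x, y, z, activation=tf.nn.tanh):
#         # activation(x (*) y + z), with (*) the element-wise product
#         return activation(tf.multiply(x, y) + z)
#
# The actual implementation lives in utils.py and may differ.
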
def data_type():
    return tf.float32


# Not used
# LSRCTuple = collections.namedtuple("LSRC", ("Local", "Global"))
class LM(object):
    """
    This class implements the LSRC model.
    """

    def __init__(self, config, training=True):
        """
        The constructor of the LSRC-LM. We define the complete graph here.
        """
        # store the configuration for future use
        self.config = config

        # define the attributes of the LSRC model
        self.training = training
        if config.activation == 'tanh':
            self.activation = tf.nn.tanh
        elif config.activation == 'sigmoid':
            self.activation = tf.nn.sigmoid
        elif config.activation == 'relu':
            self.activation = tf.nn.relu
        elif config.activation == 'elu':
            self.activation = tf.nn.elu
        elif config.activation == 'relu6':
            self.activation = tf.nn.relu6

        self.model = 'lsrc'
        self.history_size = 1
        self.init_method = config.init_method
        self.num_layers = config.num_layers
        self.input_keep_prob = config.input_keep_prob
        self.output_keep_prob = config.output_keep_prob
        self.embed_size = config.embed_size
        self.bottleneck_size = config.bottleneck_size
        self.local_state_size = config.embed_size
        self.global_state_size = config.hidden_size
        self.vocab_size = config.vocab_size
        self.use_lstmp = (config.lstmp_proj_size or config.use_peepholes)
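
        # NOTE (editorial): the `config` object is expected to expose at least
        # batch_size, seq_length, vocab_size, embed_size, hidden_size, num_layers,
        # activation (one of 'tanh', 'sigmoid', 'relu', 'elu', 'relu6'; no fallback
        # is set for other values), init_method, init_scale, input_keep_prob,
        # output_keep_prob, bottleneck_size, lstmp_proj_size, use_peepholes and
        # grad_clip, all of which are referenced when building the graph.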
        ###############################################################
        # #############      DEFINE THE PLACEHOLDERS      #############
        # placeholders for the training input data and target words
        self.input_data = tf.placeholder(
            tf.int32, [config.batch_size, config.seq_length])
        self.targets = tf.placeholder(
            tf.int32, [config.batch_size, config.seq_length])

        ###############################################################
        # ######  DEFINE TRAINABLE VARIABLES (WEIGHTS AND BIASES)  ####
        # define the initializer of embeddings, weights and biases
        if self.init_method == "xavier":
            initializer = tf.contrib.layers.xavier_initializer(uniform=True)
        else:
            initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

        # word embeddings
        with tf.variable_scope("input_layer"):
            self.embedding = tf.get_variable("embedding", [config.vocab_size, config.embed_size],
                                             initializer=initializer)

        local_weight_init = np.random.uniform(0.0, 1.0, self.embed_size)
        with tf.variable_scope("lsrc_layer"):
            local_initializer = tf.constant_initializer(local_weight_init)
            local_weight = tf.get_variable("lsrc_local_weights", self.embed_size,
                                           initializer=local_initializer)

        # weights and biases of the bottleneck layer (if used)
        last_layer_size = self.global_state_size
        if self.bottleneck_size:
            last_layer_size = self.bottleneck_size
            with tf.variable_scope("bottleneck_layer"):
                self.bottleneck_w = tf.get_variable("bottleneck_w",
                                                    [self.global_state_size, self.bottleneck_size],
                                                    initializer=initializer)
                self.bottleneck_b = tf.get_variable("bottleneck_b", [self.bottleneck_size],
                                                    initializer=initializer)

        # weights and biases of the hidden-to-output layer
        with tf.variable_scope("output_layer"):
            self.output_w = tf.get_variable("output_w",
                                            [last_layer_size, self.vocab_size],
                                            initializer=initializer)
            self.output_b = tf.get_variable("output_b", [self.vocab_size],
                                            initializer=initializer)
        ###############################################################
        # #########       BUILD THE LM NETWORK GRAPH        ##########
        # extract the embedding of each word input in the batch
        inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)

        # apply dropout to the input if needed
        if self.training and self.input_keep_prob < 1:
            inputs = tf.nn.dropout(inputs, self.input_keep_prob)

        # rearrange the input shape to create the training sequence:
        # a sequence made of the per-time-step vertical slices of the input
        inputs = tf.split(inputs, config.seq_length, 1)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        self.global_cell = self.build_lsrc_global_cells(config)

        # initialize all LSRC states to zero.
        # The next two lines are just a hack to initialize the SRNN cell from
        # the TF built-in RNN cell.
        local_cell = rnn.BasicRNNCell(self.embed_size)
        self.local_state = local_cell.zero_state(config.batch_size, tf.float32)
        self.global_state = self.global_cell.zero_state(config.batch_size, tf.float32)

        # build the local recurrence and update its hidden state
        rec_local_state, self.final_local_state = self.lsrc_local_sequence_graph(config, inputs)
        # build the global recurrence on top of it and update its hidden state
        rec_global_state, self.final_global_state = self.lsrc_global_sequence_graph(config, rec_local_state)

        # apply the bottleneck layer if used
        if self.bottleneck_size:
            last_layer = self.activation(tf.nn.xw_plus_b(rec_global_state, self.bottleneck_w, self.bottleneck_b))
        else:
            last_layer = rec_global_state

        # self.logits = tf.matmul(output, self.output_w) + self.output_b
        # self.probs = tf.nn.softmax(self.logits)
        logits = tf.nn.xw_plus_b(last_layer, self.output_w, self.output_b)
        # reshape logits to a 3-D tensor for the sequence loss
        self.logits = tf.reshape(logits, [config.batch_size, config.seq_length, config.vocab_size])
        loss = tf.contrib.seq2seq.sequence_loss(
            self.logits,
            self.targets,
            tf.ones([config.batch_size, config.seq_length], dtype=data_type()),
            average_across_timesteps=False,
            average_across_batch=True)
        with tf.name_scope('cost'):
            self.cost = tf.reduce_sum(loss)
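
        # NOTE (editorial): with average_across_timesteps=False and
        # average_across_batch=True, sequence_loss returns one batch-averaged
        # cross-entropy value per time step; summing them gives a cost whose
        # per-word average, exponentiated in run_model(), is the perplexity.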
        ###################################################
        # Training stage.
        # If we are in the training stage, compute the gradients, back-propagate
        # the error and update the weights, biases and word embeddings.
        if self.training:
            self.lr = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            # perform gradient clipping
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), config.grad_clip)
            # update variables (weights, biases, embeddings, ...)
            with tf.name_scope('optimizer'):
                # optimizer = tf.train.AdamOptimizer(self.lr)
                # self.train_op = optimizer.apply_gradients(zip(grads, tvars))
                optimizer = tf.train.GradientDescentOptimizer(self.lr)
                self.train_op = optimizer.apply_gradients(
                    zip(grads, tvars),
                    global_step=tf.contrib.framework.get_or_create_global_step())
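
            # NOTE (editorial): self.lr is a non-trainable variable that is never
            # assigned inside this graph; the training driver is expected to set it
            # externally, e.g. session.run(tf.assign(model.lr, new_learning_rate)).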
    def build_lsrc_global_cells(self, config):
        """
        Build and return the global recurrent cell of the LSRC model.
        """
        cells_ = []
        for _ in range(config.num_layers):
            # define the global state of the LSRC model at this layer
            if self.use_lstmp:
                global_cell_ = rnn.LSTMCell(self.global_state_size, use_peepholes=config.use_peepholes,
                                            num_proj=config.lstmp_proj_size)
            else:
                global_cell_ = rnn.BasicLSTMCell(self.global_state_size)
            # apply dropout if specified
            if self.training and config.output_keep_prob < 1.0:
                global_cell_ = rnn.DropoutWrapper(global_cell_, output_keep_prob=config.output_keep_prob)
            # curren_lsrc_layer = LSRCTuple(local_cell_, global_cell_)
            cells_.append(global_cell_)
        # build and return the recurrent cell graph of LSRC
        return rnn.MultiRNNCell(cells_, state_is_tuple=True)
    def lsrc_local_sequence_graph(self, config, inputs):
        """
        Build the recurrence graph of the local state of the LSRC model.
        It returns a list of the hidden outputs and the last hidden state.
        """
        outputs = []
        state = self.local_state
        with tf.variable_scope("lsrc_layer", reuse=True):
            lsrc_local_weights = tf.get_variable("lsrc_local_weights")
            activation_ = tf.nn.tanh
            for i in range(config.seq_length):
                state = xewy_plus_z(lsrc_local_weights, state, inputs[i], activation=activation_)
                outputs.append(state)
        last_state = outputs[-1]
        # outputs = tf.reshape(tf.concat(outputs, 1), [-1, self.local_state_size])
        # apply dropout to the local outputs if required
        if self.training and self.output_keep_prob < 1:
            outputs = tf.nn.dropout(outputs, self.output_keep_prob)
            outputs = tf.split(outputs, config.seq_length, 0)
            outputs = [tf.squeeze(output_, [0]) for output_ in outputs]
        return outputs, last_state
    def lsrc_global_sequence_graph(self, config, inputs):
        """
        Build the recurrence graph of the global state of the LSRC model.
        It returns the reshaped hidden outputs and the last hidden state.
        """
        outputs, last_state = legacy_seq2seq.rnn_decoder(inputs, self.global_state, self.global_cell,
                                                         loop_function=None)
        output = tf.reshape(tf.concat(outputs, 1), [-1, self.global_state_size])
        return output, last_state
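
    # NOTE (editorial): when config.lstmp_proj_size is set, rnn.LSTMCell emits
    # outputs of size num_proj rather than hidden_size, while the reshape above
    # and the output-layer weights assume global_state_size (= hidden_size);
    # the two are presumably meant to coincide in the configurations used here.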
    def run_model(self, session, data, eval_op=None, verbosity=10000, verbose=False):
        """
        Train or test the current model on some given data.
        This basically trains/applies the model on data loaded by the
        data processor, which makes it possible to handle a large corpus
        by splitting it into smaller chunks and processing them one by one.
        """
        data.reset_batch_pointer()
        start_time = time.time()
        costs = 0.0
        iters = 0
        local_state = session.run(self.local_state)
        global_state = session.run(self.global_state)

        fetches = {
            "cost": self.cost,
            "final_local_state": self.final_local_state,
            "final_global_state": self.final_global_state,
        }
        if eval_op is not None:
            fetches["eval_op"] = eval_op

        print_tresh = 0
        for step in range(data.num_batches):
            indata, target = data.next_batch()
            feed_dict = {self.local_state: local_state, self.global_state: global_state,
                         self.input_data: indata, self.targets: target}
            vals = session.run(fetches, feed_dict)
            cost = vals["cost"]
            local_state = vals["final_local_state"]
            global_state = vals["final_global_state"]

            costs += cost
            iters += data.seq_length
            total_proc_words = float((iters - 1) * data.batch_size)
            if verbose and (step == 0 or total_proc_words > print_tresh or step == data.num_batches - 1):
                print("[INFO] Progress: {:.2f}% | "
                      "Perplexity: {:.3f} | "
                      "Total Words: {:.1f}K | "
                      "Speed: {:.1f}K word/second"
                      .format((step + 1) / data.num_batches * 100, np.exp(costs / iters),
                              total_proc_words / 1000,
                              total_proc_words / (1000 * (time.time() - start_time))))
                print_tresh += verbosity

        return np.exp(costs / iters)
#########################################################################################
# # currently not used, but could be if we do not use the TF built-in local and global cells
# def initialize_lsrc_cells(self, config):
#
# initial_state = []
# for layer in range(self.num_layers):
# initial_local_state_ = self.cell[layer].Local.zero_state(config.batch_size, tf.float32)
# initial_global_state_ = self.cell[layer].Global.zero_state(config.batch_size, tf.float32)
# curren_lsrc_init = LSRCTuple(initial_local_state_, initial_global_state_)
# initial_state.append(curren_lsrc_init)
# return initial_state
# # currently not used, but could be if we do not use the TF built-in local and global cells
# def lsrc_sequence_graph_old(self, config, inputs):
# """
# Build the recurrence graph of the LSRC model.
# It returns the output and the last hidden layer
# """
#
# outputs = inputs
# last_state = []
#
# for layer in range(self.num_layers):
# inputs, last_local_state_ = legacy_seq2seq.rnn_decoder(outputs, self.initial_state[layer].Local,
# self.cell[layer].Local, loop_function=None)
# outputs, last_global_state_ = legacy_seq2seq.rnn_decoder(inputs, self.initial_state[layer].Global,
# self.cell[layer].Global, loop_function=None)
# last_lsrc_states_ = LSRCTuple(last_local_state_, last_global_state_)
# last_state.append(last_lsrc_states_)
#
# output = tf.reshape(tf.concat(outputs, 1), [-1, self.global_state_size])
#
# return output, last_state
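
# ---------------------------------------------------------------------------
# Minimal usage sketch (editorial addition, not part of the original recipe).
# It assumes a hypothetical `Config` object exposing the attributes read in
# LM.__init__ and a hypothetical data processor exposing reset_batch_pointer(),
# next_batch(), num_batches, seq_length and batch_size, as used by run_model():
#
#   config = Config()                          # hypothetical config class
#   train_data = DataProcessor(...)            # hypothetical data processor
#
#   with tf.Graph().as_default():
#       with tf.variable_scope("model"):
#           model = LM(config, training=True)
#       with tf.Session() as session:
#           session.run(tf.global_variables_initializer())
#           for epoch in range(num_epochs):
#               session.run(tf.assign(model.lr, learning_rate))
#               train_ppl = model.run_model(session, train_data,
#                                           eval_op=model.train_op, verbose=True)
# ---------------------------------------------------------------------------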