diff --git a/README.md b/README.md index 595fccb..3f137d5 100644 --- a/README.md +++ b/README.md @@ -14,3 +14,10 @@ The root directory of this repository contains the code required to perform extr The entry point is [model.py](model.py), which can use tagging datasets created using the [make_dataset.py](make_dataset.py) script. Note that `model.py` accepts pre-trained Word Embedding models via **text files** with no header. For Mimick models, this exact format is output into the path in [mimick/model.py](mimick/model.py) script's `--output` argument. For Word2Vec, FastText, or Polyglot models, one can create such a file using the [scripts/output_word_vectors.py](scripts/output_word_vectors.py) script that accepts a model (.pkl or .bin) and the desired output vocabulary (.txt). + +## CNN Experiment (October 2017) +References: +- Zhang et al., 2015. [Character-level Convolutional Networks for Text Classification](https://arxiv.org/abs/1509.01626). +- dos Santos and Zadrozny, 2014. [Learning Character-level Representations for Part-of-Speech Tagging](http://proceedings.mlr.press/v32/santos14.pdf). Applied to POS tagging. +- Shen et al., CIKM 2014. [A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval](https://dl.acm.org/citation.cfm?id=2661935). Extracted from word n-grams, applied to IR. +- Kanaris et al., 2007. [Words Versus Character N-Grams For Anti-Spam Filtering](http://www.worldscientific.com/doi/abs/10.1142/S0218213007003692). \ No newline at end of file diff --git a/mimick/consts.py b/mimick/consts.py new file mode 100644 index 0000000..b6cb8f9 --- /dev/null +++ b/mimick/consts.py @@ -0,0 +1,17 @@ +# REPRESENTATION +POLYGLOT_UNK = unicode("") +PADDING_CHAR = "<*>" + +# MODEL PARAMS +DEFAULT_CHAR_DIM = 20 +DEFAULT_HIDDEN_DIM = 50 +DEFAULT_WORD_DIM = 64 + +# LSTM +DEFAULT_LSTM_LAYERS = 1 + +# CNN +DEFAULT_WINDOW_WIDTH = 3 +DEFAULT_POOLING_MAXK = 1 +DEFAULT_STRIDE = 1 +DEFAULT_CNN_LAYERS = 1 \ No newline at end of file diff --git a/mimick/inter_nearest_vecs.py b/mimick/inter_nearest_vecs.py index 174c819..6b3407a 100755 --- a/mimick/inter_nearest_vecs.py +++ b/mimick/inter_nearest_vecs.py @@ -7,7 +7,9 @@ import numpy as np import collections import argparse -from model import LSTMMimick +from model import LSTMMimick, CNNMimick +from make_dataset import read_text_embs +from consts import * __author__ = "Yuval Pinter, 2017" @@ -22,19 +24,43 @@ def dist(v1, v2): if __name__ == "__main__": # parse command line arguments parser = argparse.ArgumentParser() - parser.add_argument("--mimick", required=True, dest="mimick", help="Mimick model file") - parser.add_argument("--c2i", required=True, dest="c2i", help="Mimick char-to-integer mapping file") - parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file with reference word vectors") - parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format") - #parser.add_argument("--ktop", dest="ktop", default=10, help="Number of top neighbors to present (optional)") + parser.add_argument("--mimick", required=True, help="Mimick model file") + parser.add_argument("--use-cnn", action="store_true", help="Use CNN model") + parser.add_argument("--c2i", required=True, help="Mimick char-to-integer mapping file") + parser.add_argument("--vectors", required=True, help="Pickle file with reference word vectors") + parser.add_argument("--w2v-format", action="store_true", help="Vector file is in textual w2v format") + parser.add_argument("--ktop", 
type=int, default=10, help="Number of top neighbors to present (optional)") + parser.add_argument("--char-dim", type=int, default=DEFAULT_CHAR_DIM, help="dimension for character embeddings (default = {})".format(DEFAULT_CHAR_DIM)) + parser.add_argument("--hidden-dim", type=int, default=DEFAULT_HIDDEN_DIM, help="dimension for LSTM layers (default = {})".format(DEFAULT_HIDDEN_DIM)) + ### LSTM ### + parser.add_argument("--num-lstm-layers", type=int, default=DEFAULT_LSTM_LAYERS, help="Number of LSTM layers (default = {})".format(DEFAULT_LSTM_LAYERS)) + ### CNN ### + parser.add_argument("--num-conv-layers", type=int, default=DEFAULT_CNN_LAYERS, help="Number of CNN layers (default = 1)") + parser.add_argument("--window-width", type=int, default=DEFAULT_WINDOW_WIDTH, help="Width of CNN layers (default = 3)") + parser.add_argument("--pooling-maxk", type=int, default=DEFAULT_POOLING_MAXK, help="K for K-max pooling (default = 1)") + parser.add_argument("--stride", type=int, dest="w_stride", default=DEFAULT_STRIDE, help="'Width' stride for CNN layers (default = 1)") + opts = parser.parse_args() + # load vocab + if opts.w2v_format: + voc_words, voc_vecs = read_text_embs(opts.vectors) + else: + voc_words, voc_vecs = pickle.load(open(opts.vectors)) + + we_dim = len(voc_vecs[0]) + # load model c2i = pickle.load(open(opts.c2i)) - mimick = LSTMMimick(c2i, file=opts.mimick) - - # load vocab - voc_words, voc_vecs = pickle.load(open(opts.vectors)) + if opts.use_cnn: + mimick = CNNMimick(c2i, num_conv_layers=opts.num_conv_layers, char_dim=opts.char_dim,\ + hidden_dim=opts.hidden_dim, window_width=opts.window_width,\ + pooling_maxk=opts.pooling_maxk, w_stride=opts.w_stride,\ + word_embedding_dim=we_dim, file=opts.mimick) + else: + mimick = LSTMMimick(c2i, num_lstm_layers=opts.num_lstm_layers, char_dim=opts.char_dim,\ + hidden_dim=opts.hidden_dim,\ + word_embedding_dim=we_dim, file=opts.mimick) # prompt while True: @@ -45,5 +71,5 @@ def dist(v1, v2): word_chars = [c2i[c] for c in next_word] pred_vec = mimick.predict_emb(word_chars).value() - top_k = sorted([(iv, dist(iv_vec, pred_vec)) for iv,iv_vec in zip(voc_words, voc_vecs)], key=lambda x: x[1])[:10] + top_k = sorted([(iv, dist(iv_vec, pred_vec)) for iv,iv_vec in zip(voc_words, voc_vecs)], key=lambda x: x[1])[:opts.ktop] print '\n'.join(['{}:\t{:.3f}'.format(near[0], 1.0 - near[1]) for near in top_k]) diff --git a/mimick/make_dataset.py b/mimick/make_dataset.py index b4d9b98..8e20d9d 100644 --- a/mimick/make_dataset.py +++ b/mimick/make_dataset.py @@ -1,9 +1,10 @@ ''' Creates dataset for trained-embeddings prediction by character Bi-LSTM.
Inputs: -- A pre-trained embedding model that is to be emulated by character model -- A set of downstream-task vocab words, those of which not present in the +- A pre-trained embedding dictionary that is to be emulated by character model +- An optional set of downstream-task vocab words, those of which not present in the pre-trained embeddings will be output by the character model + (only important for sanity statistics following model training) ''' from __future__ import division from _collections import defaultdict @@ -13,6 +14,8 @@ import collections import numpy as np +from util import charseq + __author__ = "Yuval Pinter, 2017" POLYGLOT_UNK = unicode("") @@ -32,66 +35,65 @@ def read_text_embs(filename): embs.append(np.array([float(s) for s in split[1:]])) return words, embs -def charseq(word, c2i): - chars = [] - for c in word: - if c not in c2i: - c2i[c] = len(c2i) - chars.append(c2i[c]) - return chars - -# parse arguments -parser = argparse.ArgumentParser() -parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file from which to get target word vectors") -parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format") -parser.add_argument("--vocab", required=True, dest="vocab", help="File containing words for unlabeled test set") -parser.add_argument("--output", required=True, dest="output", help="Output filename (.pkl)") +if __name__ == "__main__": + # parse arguments + parser = argparse.ArgumentParser() + parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file from which to get target word vectors") + parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format") + parser.add_argument("--vocab", dest="vocab", help="File containing words for unlabeled test set (optional)") + parser.add_argument("--output", required=True, dest="output", help="Output filename (.pkl)") -options = parser.parse_args() + options = parser.parse_args() -c2i = {} -training_instances = [] -test_instances = [] + c2i = {} + training_instances = [] + test_instances = [] -# Read in the output vocab -with codecs.open(options.vocab, "r", "utf-8") as f: - vocab = set([ line.strip() for line in f ]) + # Read in the output vocab + if options.vocab is None: + vocab = [] + else: + with codecs.open(options.vocab, "r", "utf-8") as f: + vocab = set([ line.strip() for line in f ]) -# read embeddings file -if options.w2v_format: - words, embs = read_text_embs(options.vectors) -else: - words, embs = cPickle.load(open(options.vectors, "r")) -dim = len(embs[0]) -word_to_ix = {w : i for (i,w) in enumerate(words)} + # read embeddings file + if options.w2v_format: + words, embs = read_text_embs(options.vectors) + else: + words, embs = cPickle.load(open(options.vectors, "r")) + dim = len(embs[0]) + word_to_ix = {w : i for (i,w) in enumerate(words)} -with codecs.open(options.output, "w", "utf-8") as outfile: in_vocab = 0 - total = len(vocab) for word, emb in zip(words, embs): if word == POLYGLOT_UNK or word == W2V_UNK: continue if word in vocab: in_vocab += 1 training_instances.append(Instance(charseq(word, c2i), emb)) training_char_count = len(c2i) - for v in vocab: - if v not in words: - test_instances.append(Instance(charseq(v, c2i), np.array([0.0] * dim))) - print "Total Number of output words:", total - print "Total in Training Vocabulary:", in_vocab - print "Percentage in-vocab:", in_vocab / total print "Total in Embeddings vocabulary:", len(words) print "Training 
set character count: ", training_char_count - print "Total haracter count: ", len(c2i) -c2i[PADDING_CHAR] = len(c2i) + # Test + if len(vocab) > 0: + in_vocab = 0 + total = len(vocab) + for v in vocab: + if v not in words: + test_instances.append(Instance(charseq(v, c2i), np.array([0.0] * dim))) + print "Total Number of output words:", total + print "Total in Training Vocabulary:", in_vocab + print "Percentage in-vocab:", in_vocab / total + print "Total character count: ", len(c2i) + + c2i[PADDING_CHAR] = len(c2i) -# populate output -output = {} -output["c2i"] = c2i -output["training_instances"] = training_instances -output["test_instances"] = test_instances + # populate output + output = {} + output["c2i"] = c2i + output["training_instances"] = training_instances + output["test_instances"] = test_instances -# write output -with open(options.output, "w") as outfile: - cPickle.dump(output, outfile) + # write output + with open(options.output, "w") as outfile: + cPickle.dump(output, outfile) diff --git a/mimick/model.py b/mimick/model.py index 2936dc9..95ea8f6 100644 --- a/mimick/model.py +++ b/mimick/model.py @@ -17,43 +17,139 @@ import dynet as dy import numpy as np +from util import wordify, charseq +from consts import * + __author__ = "Yuval Pinter, 2017" -POLYGLOT_UNK = unicode("") -PADDING_CHAR = "<*>" +Instance = collections.namedtuple("Instance", ["chars", "word_emb"]) -DEFAULT_CHAR_DIM = 20 -DEFAULT_HIDDEN_DIM = 50 +###################################### -Instance = collections.namedtuple("Instance", ["chars", "word_emb"]) +class CNNMimick: + ''' + Implementation details inferred from + http://dynet.readthedocs.io/en/latest/python_ref.html#convolution-pooling-operations + Another dynet CNN impl example: https://goo.gl/UFkSwf + ''' + + def __init__(self, c2i, num_conv_layers=DEFAULT_CNN_LAYERS, char_dim=DEFAULT_CHAR_DIM,\ + hidden_dim=DEFAULT_HIDDEN_DIM, window_width=DEFAULT_WINDOW_WIDTH,\ + pooling_maxk=DEFAULT_POOLING_MAXK, w_stride=DEFAULT_STRIDE,\ + word_embedding_dim=DEFAULT_WORD_DIM, file=None): + self.c2i = c2i + self.pooling_maxk = pooling_maxk + self.stride = [1, w_stride] + self.char_dim = char_dim + self.hidden_dim = hidden_dim + self.window_width = window_width + self.model = dy.Model() + + ### TODO allow more layers (create list of length num_conv_layers, + ### don't forget relu and max-pooling after each in predict_emb) + if num_conv_layers > 1: + print "Warning: unsupported value passed for number of CNN layers: {}. 
Using 1 instead."\ + .format(num_conv_layers) + + self.char_lookup = self.model.add_lookup_parameters((len(c2i), char_dim), name="ce") + self.conv = self.model.add_parameters((1, window_width, char_dim, hidden_dim), name="conv") + self.conv_bias = self.model.add_parameters((hidden_dim), name="convb") + + self.cnn_to_rep_params = self.model.add_parameters((word_embedding_dim, hidden_dim * pooling_maxk), name="H") + self.cnn_to_rep_bias = self.model.add_parameters(word_embedding_dim, name="Hb") + self.mlp_out = self.model.add_parameters((word_embedding_dim, word_embedding_dim), name="O") + self.mlp_out_bias = self.model.add_parameters(word_embedding_dim, name="Ob") + + if file is not None: + ### NOTE - dynet 2.0 only supports explicit loading into params, so + ### dimensionalities all need to be specified in init + self.model.populate(file) + + def predict_emb(self, chars): + dy.renew_cg() + + conv_param = dy.parameter(self.conv) + conv_param_bias = dy.parameter(self.conv_bias) + + H = dy.parameter(self.cnn_to_rep_params) + Hb = dy.parameter(self.cnn_to_rep_bias) + O = dy.parameter(self.mlp_out) + Ob = dy.parameter(self.mlp_out_bias) + + # padding + pad_char = self.c2i[PADDING_CHAR] + padding_size = self.window_width // 2 # TODO also consider w_stride? + char_ids = ([pad_char] * padding_size) + chars + ([pad_char] * padding_size) + if len(chars) < self.pooling_maxk: + # allow k-max pooling layer output to transform to affine + char_ids.extend([pad_char] * (self.pooling_maxk - len(chars))) + + embeddings = dy.concatenate_cols([self.char_lookup[cid] for cid in char_ids]) + reshaped_embeddings = dy.reshape(dy.transpose(embeddings), (1, len(char_ids), self.char_dim)) + + # not using is_valid=False due to maxk-pooling-induced extra padding + conv_out = dy.conv2d_bias(reshaped_embeddings, conv_param, conv_param_bias, self.stride, is_valid=True) + + relu_out = dy.rectify(conv_out) + + ### pooling when max_k can only be 1, not sure what other differences may be + #poolingk = [1, len(chars)] + #pooling_out = dy.maxpooling2d(relu_out, poolingk, self.stride, is_valid=True) + #pooling_out_flat = dy.reshape(pooling_out, (self.hidden_dim,)) + + ### another possible way for pooling is just max_dim(relu_out, d=1) + + pooling_out = dy.kmax_pooling(relu_out, self.pooling_maxk, d=1) # d = what dimension to max over + pooling_out_flat = dy.reshape(pooling_out, (self.hidden_dim * self.pooling_maxk,)) + + return O * dy.tanh(H * pooling_out_flat + Hb) + Ob + + def loss(self, observation, target_rep): + return dy.squared_distance(observation, dy.inputVector(target_rep)) + + def set_dropout(self, p): + ### TODO see if supported/needed + pass + + def disable_dropout(self): + ### TODO see if supported/needed + pass + + def save(self, file_name): + self.model.save(file_name) + # character mapping saved separately + cPickle.dump(self.c2i, open(file_name[:-4] + '.c2i', 'w')) + + @property + def model(self): + return self.model + +###################################### class LSTMMimick: - def __init__(self, c2i, num_lstm_layers=-1,\ - char_dim=-1, hidden_dim=-1, word_embedding_dim=-1, file=None): + def __init__(self, c2i, num_lstm_layers=DEFAULT_LSTM_LAYERS,\ + char_dim=DEFAULT_CHAR_DIM, hidden_dim=DEFAULT_HIDDEN_DIM,\ + word_embedding_dim=DEFAULT_WORD_DIM, file=None): self.c2i = c2i self.model = dy.Model() - if file == None: - # Char LSTM Parameters - self.char_lookup = self.model.add_lookup_parameters((len(c2i), char_dim)) - self.char_fwd_lstm = dy.LSTMBuilder(num_lstm_layers, char_dim, hidden_dim, self.model) - 
self.char_bwd_lstm = dy.LSTMBuilder(num_lstm_layers, char_dim, hidden_dim, self.model) - - # Post-LSTM Parameters - self.lstm_to_rep_params = self.model.add_parameters((word_embedding_dim, hidden_dim * 2)) - self.lstm_to_rep_bias = self.model.add_parameters(word_embedding_dim) - self.mlp_out = self.model.add_parameters((word_embedding_dim, word_embedding_dim)) - self.mlp_out_bias = self.model.add_parameters(word_embedding_dim) - else: - # read from saved file. c2i mapping to be read by calling function (for now) - model_members = iter(self.model.load(file)) - self.char_lookup = model_members.next() - self.char_fwd_lstm = model_members.next() - self.char_bwd_lstm = model_members.next() - self.lstm_to_rep_params = model_members.next() - self.lstm_to_rep_bias = model_members.next() - self.mlp_out = model_members.next() - self.mlp_out_bias = model_members.next() + + # Char LSTM Parameters + self.char_lookup = self.model.add_lookup_parameters((len(c2i), char_dim), name="ce") + self.char_fwd_lstm = dy.LSTMBuilder(num_lstm_layers, char_dim, hidden_dim, self.model) + self.char_bwd_lstm = dy.LSTMBuilder(num_lstm_layers, char_dim, hidden_dim, self.model) + + # Post-LSTM Parameters + self.lstm_to_rep_params = self.model.add_parameters((word_embedding_dim, hidden_dim * 2), name="H") + self.lstm_to_rep_bias = self.model.add_parameters(word_embedding_dim, name="Hb") + self.mlp_out = self.model.add_parameters((word_embedding_dim, word_embedding_dim), name="O") + self.mlp_out_bias = self.model.add_parameters(word_embedding_dim, name="Ob") + + if file is not None: + # read from saved file; see old_load() for dynet 1.0 format + ### NOTE - dynet 2.0 only supports explicit loading into params, so + ### dimensionalities all need to be specified in init + self.model.populate(file) def predict_emb(self, chars): dy.renew_cg() @@ -89,6 +185,12 @@ def disable_dropout(self): self.char_bwd_lstm.disable_dropout() def save(self, file_name): + self.model.save(file_name) + + # character mapping saved separately + cPickle.dump(self.c2i, open(file_name[:-4] + '.c2i', 'w')) + + def old_save(self, file_name): members_to_save = [] members_to_save.append(self.char_lookup) members_to_save.append(self.char_fwd_lstm) @@ -101,15 +203,22 @@ def save(self, file_name): # character mapping saved separately cPickle.dump(self.c2i, open(file_name[:-4] + '.c2i', 'w')) + + def old_load(): + # for this to load in __init__() there's no param init necessary + model_members = iter(self.model.load(file)) + self.char_lookup = model_members.next() + self.char_fwd_lstm = model_members.next() + self.char_bwd_lstm = model_members.next() + self.lstm_to_rep_params = model_members.next() + self.lstm_to_rep_bias = model_members.next() + self.mlp_out = model_members.next() + self.mlp_out_bias = model_members.next() @property def model(self): return self.model - -def wordify(instance): - return ''.join([i2c[i] for i in instance.chars]) - def dist(instance, vec): we = instance.word_emb if options.cosine: @@ -122,26 +231,35 @@ def dist(instance, vec): # Argument parsing # ===-----------------------------------------------------------------------=== parser = argparse.ArgumentParser() - parser.add_argument("--dataset", required=True, dest="dataset", help=".pkl file to use") - parser.add_argument("--vocab", required=True, dest="vocab", help="total vocab to output") - parser.add_argument("--output", dest="output", help="file with all embeddings") - parser.add_argument("--model-out", dest="model_out", help="file with model parameters") - 
parser.add_argument("--lang", dest="lang", default="en", help="language") - parser.add_argument("--char-dim", default=DEFAULT_CHAR_DIM, dest="char_dim", help="dimension for character embeddings (default = 20)") - parser.add_argument("--hidden-dim", default=DEFAULT_HIDDEN_DIM, dest="hidden_dim", help="dimension for LSTM layers (default = 50)") - parser.add_argument("--num-lstm-layers", default=1, dest="num_lstm_layers", help="Number of LSTM layers (default = 1)") - parser.add_argument("--all-from-lstm", dest="all_from_lstm", action="store_true", help="if toggled, vectors in original training set are overriden by LSTM-generated vectors") - parser.add_argument("--normalized-targets", dest="normalized_targets", action="store_true", help="if toggled, train on normalized vectors from set") - parser.add_argument("--dropout", default=-1, dest="dropout", type=float, help="amount of dropout to apply to LSTM part of graph") - parser.add_argument("--num-epochs", default=10, dest="num_epochs", type=int, help="Number of full passes through training set (default = 10)") - parser.add_argument("--learning-rate", default=0.01, dest="learning_rate", type=float, help="Initial learning rate") - parser.add_argument("--cosine", dest="cosine", action="store_true", help="Use cosine as diff measure") + parser.add_argument("--dataset", required=True, help=".pkl file to use") + parser.add_argument("--vocab", required=True, help="total vocab to output") + parser.add_argument("--output", help="file with all embeddings") + parser.add_argument("--model-out", help="file with model parameters") + parser.add_argument("--lang", default="en", help="language (optional, appears in log dir name)") + parser.add_argument("--char-dim", type=int, default=DEFAULT_CHAR_DIM, help="dimension for character embeddings (default = {})".format(DEFAULT_CHAR_DIM)) + parser.add_argument("--hidden-dim", type=int, default=DEFAULT_HIDDEN_DIM, help="dimension for LSTM layers (default = {})".format(DEFAULT_HIDDEN_DIM)) + ### LSTM ### + parser.add_argument("--num-lstm-layers", type=int, default=DEFAULT_LSTM_LAYERS, help="Number of LSTM layers (default = {})".format(DEFAULT_LSTM_LAYERS)) + ### CNN ### + parser.add_argument("--use-cnn", dest="cnn", action="store_true", help="if toggled, train CNN and not LSTM") + parser.add_argument("--num-conv-layers", type=int, default=DEFAULT_CNN_LAYERS, help="Number of CNN layers (default = 1, more not currently supported)") + parser.add_argument("--window-width", type=int, default=DEFAULT_WINDOW_WIDTH, help="Width of CNN layers (default = 3)") + parser.add_argument("--pooling-maxk", type=int, default=DEFAULT_POOLING_MAXK, help="K for K-max pooling (default = 1)") + parser.add_argument("--stride", default=DEFAULT_STRIDE, dest="w_stride", help="'Width' stride for CNN layers (default = 1)") + ### END ### + parser.add_argument("--all-from-mimick", action="store_true", help="if toggled, vectors in original training set are overriden by Mimick-generated vectors") + parser.add_argument("--normalized-targets", action="store_true", help="if toggled, train on normalized vectors from set") + parser.add_argument("--dropout", default=-1, type=float, help="amount of dropout to apply to LSTM part of graph") + parser.add_argument("--num-epochs", default=10, type=int, help="Number of full passes through training set (default = 10)") + parser.add_argument("--learning-rate", default=0.01, type=float, help="Initial learning rate") + parser.add_argument("--cosine", action="store_true", help="Use cosine as diff measure") 
parser.add_argument("--dynet-mem", help="Ignore this outside argument") - parser.add_argument("--debug", dest="debug", action="store_true", help="Debug mode") + parser.add_argument("--debug", action="store_true", help="Debug mode") options = parser.parse_args() # Set up logging - log_dir = "embedding_train_charlstm-{}-{}".format(datetime.datetime.now().strftime('%y%m%d%H%M%S'), options.lang) + log_dir = "embedding_train_mimick-{}-{}{}".format(datetime.datetime.now().strftime('%y%m%d%H%M%S'),\ + options.lang, '-DEBUG' if options.debug else '') os.mkdir(log_dir) root_logger = logging.getLogger() @@ -154,24 +272,53 @@ def dist(instance, vec): root_logger.info("Training dataset: {}".format(options.dataset)) root_logger.info("Output vocabulary: {}".format(options.vocab)) root_logger.info("Vectors output location: {}".format(options.output)) - root_logger.info("Model output location: {}\n".format(options.model_out)) - + root_logger.info("Model output location: {}".format(options.model_out)) + if options.all_from_mimick: + root_logger.info("All output vectors to be written from Mimick inference.") + + root_logger.info("\nModel Architecture: {}".format('CNN' if options.cnn else 'LSTM')) + if options.cnn: + root_logger.info("Layers: {}".format(options.num_conv_layers)) + root_logger.info("Window width: {}".format(options.window_width)) + root_logger.info("Pooling Max-K: {}".format(options.pooling_maxk)) + root_logger.info("Stride: {}".format(options.w_stride)) + else: + root_logger.info("Layers: {}".format(options.num_lstm_layers)) + root_logger.info("Character embedding dimension: {}".format(options.char_dim)) + root_logger.info("Hidden dimension: {}".format(options.hidden_dim)) + # Load training set dataset = cPickle.load(open(options.dataset, "r")) c2i = dataset["c2i"] i2c = { i: c for c, i in c2i.items() } # inverse map training_instances = dataset["training_instances"] test_instances = dataset["test_instances"] + populate_test_insts_from_vocab = len(test_instances) == 0 emb_dim = len(training_instances[0].word_emb) # Load words to write vocab_words = {} + if populate_test_insts_from_vocab: + train_words = [wordify(w, i2c) for w in training_instances] with codecs.open(options.vocab, "r", "utf-8") as vocab_file: for vw in vocab_file.readlines(): - vocab_words[vw.strip()] = np.array([0.0] * emb_dim) - - model = LSTMMimick(c2i, options.num_lstm_layers, options.char_dim, options.hidden_dim, emb_dim) - trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9, 0.1) + vw = vw.strip() + vocab_words[vw] = np.zeros(emb_dim) + if populate_test_insts_from_vocab and vw not in train_words: + test_instances.append(Instance(charseq(vw, c2i), np.zeros(emb_dim))) + + if populate_test_insts_from_vocab: + # need to update i2c if saw new characters + i2c = { i: c for c, i in c2i.items() } + + if not options.cnn: + model = LSTMMimick(c2i, options.num_lstm_layers, options.char_dim, options.hidden_dim, emb_dim) + else: + model = CNNMimick(c2i, options.num_conv_layers, options.char_dim, options.hidden_dim,\ + options.window_width, options.pooling_maxk, options.w_stride, emb_dim) + + #trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9) + trainer = dy.AdamTrainer(model.model, 0.001, 0.9, 0.999) root_logger.info("Training Algorithm: {}".format(type(trainer))) root_logger.info("Number training instances: {}".format(len(training_instances))) @@ -224,10 +371,10 @@ def dist(instance, vec): trainer.update() if epoch == epcs - 1: - word = wordify(instance) + word = wordify(instance, i2c) if 
word in vocab_words: pretrained_vec_norms += np.linalg.norm(instance.word_emb) - if options.all_from_lstm: + if options.all_from_mimick: vocab_words[word] = np.array(obs_emb.value()) inferred_vec_norms += np.linalg.norm(vocab_words[word]) else: # log vocab embeddings @@ -235,7 +382,7 @@ def dist(instance, vec): root_logger.info("\n") root_logger.info("Epoch {} complete".format(epoch + 1)) - trainer.update_epoch(1) + # here used to be a learning rate update, no longer supported in dynet 2.0 print trainer.status() # Evaluate dev data @@ -251,10 +398,10 @@ def dist(instance, vec): dev_loss += model.loss(obs_emb, instance.word_emb).scalar_value() if epoch == epcs - 1: - word = wordify(instance) + word = wordify(instance, i2c) if word in vocab_words: pretrained_vec_norms += np.linalg.norm(instance.word_emb) - if options.all_from_lstm: + if options.all_from_mimick: vocab_words[word] = np.array(obs_emb.value()) inferred_vec_norms += np.linalg.norm(vocab_words[word]) else: # log vocab embeddings @@ -267,32 +414,33 @@ def dist(instance, vec): root_logger.info("Average norm for pre-trained in vocab: {}".format(pretrained_vec_norms / len(vocab_words))) # Infer for test set - showcase_size = 25 + showcase_size = 5 top_to_show = 10 showcase = [] # sample for similarity sanity check for idx, instance in enumerate(test_instances): - word = wordify(instance) + word = wordify(instance, i2c) obs_emb = model.predict_emb(instance.chars) vocab_words[word] = np.array(obs_emb.value()) inferred_vec_norms += np.linalg.norm(vocab_words[word]) - # reservoir sampling - if idx < showcase_size: - showcase.append(word) - else: - rand = random.randint(0,idx-1) - if rand < showcase_size: - showcase[rand] = word + if options.debug: + # reservoir sampling + if idx < showcase_size: + showcase.append(word) + else: + rand = random.randint(0,idx-1) + if rand < showcase_size: + showcase[rand] = word root_logger.info("Average norm for trained: {}".format(inferred_vec_norms / len(test_instances))) - similar_words = {} - for w in showcase: - vec = vocab_words[w] - top_k = [(wordify(instance),d) for instance,d in sorted([(inst, dist(inst, vec)) for inst in training_instances], key=lambda x: x[1])[:top_to_show]] - if options.debug: + if options.debug: + similar_words = {} + for w in showcase: + vec = vocab_words[w] + top_k = [(wordify(instance, i2c),d) for instance,d in sorted([(inst, dist(inst, vec)) for inst in training_instances], key=lambda x: x[1])[:top_to_show]] print w, [(i,d) for i,d in top_k] - similar_words[w] = top_k + similar_words[w] = top_k # write all @@ -302,9 +450,9 @@ def dist(instance, vec): for vw, emb in vocab_words.iteritems(): writer.write(vw + " ") for i in emb: - writer.write(str(i) + " ") + writer.write("{:.6f} ".format(i)) writer.write("\n") # save model if options.model_out is not None: - model.save(options.model_out) \ No newline at end of file + model.save(options.model_out) diff --git a/mimick/util.py b/mimick/util.py new file mode 100644 index 0000000..9ac36c9 --- /dev/null +++ b/mimick/util.py @@ -0,0 +1,10 @@ +def wordify(instance, i2c): + return ''.join([i2c[i] for i in instance.chars]) + +def charseq(word, c2i): + chars = [] + for c in word: + if c not in c2i: + c2i[c] = len(c2i) + chars.append(c2i[c]) + return chars \ No newline at end of file diff --git a/model.py b/model.py index 7a52b4f..b76173f 100644 --- a/model.py +++ b/model.py @@ -67,7 +67,7 @@ def __init__(self, tagset_sizes, num_lstm_layers, hidden_dim, word_embeddings, n vocab_size = word_embeddings.shape[0] word_embedding_dim = 
word_embeddings.shape[1] - self.words_lookup = self.model.add_lookup_parameters((vocab_size, word_embedding_dim)) + self.words_lookup = self.model.add_lookup_parameters((vocab_size, word_embedding_dim), name="we") if word_embeddings is not None: self.words_lookup.init_from_array(word_embeddings) @@ -75,7 +75,7 @@ def __init__(self, tagset_sizes, num_lstm_layers, hidden_dim, word_embeddings, n # Char LSTM Parameters self.use_char_rnn = use_char_rnn if use_char_rnn: - self.char_lookup = self.model.add_lookup_parameters((charset_size, char_embedding_dim)) + self.char_lookup = self.model.add_lookup_parameters((charset_size, char_embedding_dim), name="ce") self.char_bi_lstm = dy.BiRNNBuilder(1, char_embedding_dim, hidden_dim, self.model, dy.LSTMBuilder) # Word LSTM parameters @@ -91,10 +91,10 @@ def __init__(self, tagset_sizes, num_lstm_layers, hidden_dim, word_embeddings, n self.mlp_out = {} self.mlp_out_bias = {} for att, set_size in tagset_sizes.items(): - self.lstm_to_tags_params[att] = self.model.add_parameters((set_size, hidden_dim)) - self.lstm_to_tags_bias[att] = self.model.add_parameters(set_size) - self.mlp_out[att] = self.model.add_parameters((set_size, set_size)) - self.mlp_out_bias[att] = self.model.add_parameters(set_size) + self.lstm_to_tags_params[att] = self.model.add_parameters((set_size, hidden_dim), name=att+"H") + self.lstm_to_tags_bias[att] = self.model.add_parameters(set_size, name=att+"Hb") + self.mlp_out[att] = self.model.add_parameters((set_size, set_size), name=att+"O") + self.mlp_out_bias[att] = self.model.add_parameters(set_size, name=att+"Ob") def word_rep(self, word, char_ids): ''' @@ -112,7 +112,10 @@ def word_rep(self, word, char_ids): def build_tagging_graph(self, sentence, word_chars): dy.renew_cg() - embeddings = [self.word_rep(w, chars) for w, chars in zip(sentence, word_chars)] + if word_chars == None: + embeddings = [self.word_rep(w, None) for w in sentence] + else: + embeddings = [self.word_rep(w, chars) for w, chars in zip(sentence, word_chars)] lstm_out = self.word_bi_lstm.transduce(embeddings) @@ -174,11 +177,21 @@ def set_dropout(self, p): def disable_dropout(self): self.word_bi_lstm.disable_dropout() - + def save(self, file_name): ''' Serialize model parameters for future loading and use. - Currently loaded using initializer in scripts/test_model.py + TODO change reading in scripts/test_model.py + ''' + self.model.save(file_name) + + with open(file_name + "-atts", 'w') as attdict: + attdict.write("\t".join(sorted(self.attributes))) + + def old_save(self, file_name): + ''' + Serialize model parameters for future loading and use. 
+ Old version (pre dynet 2.0) loaded using initializer in scripts/test_model.py ''' members_to_save = [] members_to_save.append(self.words_lookup) @@ -333,7 +346,7 @@ def get_word_chars(sentence, i2w, c2i): vocab_size=len(w2i), word_embedding_dim=DEFAULT_WORD_EMBEDDING_SIZE) - trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9, 0.1) + trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9) logging.info("Training Algorithm: {}".format(type(trainer))) logging.info("Number training instances: {}".format(len(training_instances))) @@ -386,7 +399,7 @@ def get_word_chars(sentence, i2w, c2i): # log epoch's train phase logging.info("\n") logging.info("Epoch {} complete".format(epoch + 1)) - trainer.update_epoch(1) + # here used to be a learning rate update, no longer supported in dynet 2.0 print trainer.status() train_loss = train_loss / len(train_instances) @@ -483,8 +496,6 @@ def get_word_chars(sentence, i2w, c2i): logging.info("Removing files from previous epoch.") old_model_file_name = "{}/model_epoch-{:02d}.bin".format(options.log_dir, epoch) os.remove(old_model_file_name) - os.remove(old_model_file_name + ".pym") - os.remove(old_model_file_name + ".pyk") os.remove(old_model_file_name + "-atts") # epoch loop ends
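Note on the pooling step introduced in CNNMimick.predict_emb: dy.kmax_pooling(relu_out, self.pooling_maxk, d=1), followed by the reshape to hidden_dim * pooling_maxk, collapses the variable-length character axis into a fixed-size vector for the affine layers H and O. The snippet below is a minimal NumPy sketch of that reduction only, assuming the order-preserving k-max definition of Kalchbrenner et al. (2014); the kmax_pooling helper, the shapes, and the simplified (hidden_dim, seq_len) layout (dynet's conv output is (1, seq_len, hidden_dim)) are illustrative stand-ins, not the dynet implementation itself.

import numpy as np

def kmax_pooling(acts, k):
    # acts: (hidden_dim, seq_len) matrix of post-ReLU convolution activations.
    # Returns (hidden_dim, k): for each filter (row), its k largest activations,
    # kept in their original left-to-right order.
    top_idx = np.argsort(acts, axis=1)[:, -k:]  # positions of the k largest values per row
    top_idx.sort(axis=1)                        # restore left-to-right order
    rows = np.arange(acts.shape[0])[:, None]
    return acts[rows, top_idx]

hidden_dim, seq_len, k = 4, 7, 2                               # e.g. 7 padded characters, pooling_maxk = 2
acts = np.maximum(np.random.randn(hidden_dim, seq_len), 0.0)   # stand-in for dy.rectify(conv_out)
flat = kmax_pooling(acts, k).reshape(hidden_dim * k)           # analogue of pooling_out_flat
print(flat.shape)                                              # (8,)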