CNN-Mimick and DyNet 2.0 #2

Merged (20 commits), Nov 22, 2017
7 changes: 7 additions & 0 deletions README.md
@@ -14,3 +14,10 @@ The root directory of this repository contains the code required to perform extr

The entry point is [model.py](model.py), which can use tagging datasets created using the [make_dataset.py](make_dataset.py) script.
Note that `model.py` accepts pre-trained Word Embedding models via **text files** with no header. For Mimick models, this exact format is produced at the path given by the `--output` argument of the [mimick/model.py](mimick/model.py) script. For Word2Vec, FastText, or Polyglot models, such a file can be created using the [scripts/output_word_vectors.py](scripts/output_word_vectors.py) script, which accepts a model (.pkl or .bin) and the desired output vocabulary (.txt).
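
As an editorial illustration (not part of the README or this diff), here is a minimal sketch of a reader for that header-less text format, simplified from the `read_text_embs` helper that appears later in this diff; real vocabularies would also need UTF-8 handling via `codecs.open`:

```python
import numpy as np

def read_text_embs(path):
    # One word per line, no header: <word> <float_1> ... <float_d>
    words, embs = [], []
    with open(path) as f:
        for line in f:
            parts = line.strip().split()
            words.append(parts[0])
            embs.append(np.array([float(x) for x in parts[1:]]))
    return words, embs
```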

## CNN Experiment (October 2017)
References:
- Zhang et al., 2015. [Character-level Convolutional Networks for Text Classification](https://arxiv.org/abs/1509.01626).
- dos Santos and Zadrozny, 2014. [Learning Character-level Representations for Part-of-Speech Tagging](http://proceedings.mlr.press/v32/santos14.pdf). Applied to POS tagging.
- Shen et al., CIKM 2014. [A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval](https://dl.acm.org/citation.cfm?id=2661935). Extracted from word n-grams, applied to IR.
- Kanaris et al., 2007. [Words Versus Character N-Grams For Anti-Spam Filtering](http://www.worldscientific.com/doi/abs/10.1142/S0218213007003692).
17 changes: 17 additions & 0 deletions mimick/consts.py
@@ -0,0 +1,17 @@
# REPRESENTATION
POLYGLOT_UNK = unicode("<UNK>")
PADDING_CHAR = "<*>"

# MODEL PARAMS
DEFAULT_CHAR_DIM = 20
DEFAULT_HIDDEN_DIM = 50
DEFAULT_WORD_DIM = 64

# LSTM
DEFAULT_LSTM_LAYERS = 1

# CNN
DEFAULT_WINDOW_WIDTH = 3
DEFAULT_POOLING_MAXK = 1
DEFAULT_STRIDE = 1
DEFAULT_CNN_LAYERS = 1
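
To make these defaults concrete, here is a minimal NumPy sketch (an editorial illustration, not the repository's DyNet implementation) of a single character-level convolution with window width 3, stride 1, and k-max pooling with k = 1; all sizes are hypothetical:

```python
import numpy as np

# Hypothetical sizes matching the defaults above.
char_dim, window_width, stride, maxk = 20, 3, 1, 1
num_chars, num_filters = 9, 50           # e.g. a 9-character word, hidden dim 50

chars = np.random.randn(num_chars, char_dim)              # character embeddings
filters = np.random.randn(num_filters, window_width * char_dim)

# Slide the window over the character sequence.
windows = [chars[i:i + window_width].ravel()
           for i in range(0, num_chars - window_width + 1, stride)]
conv = np.dot(np.stack(windows), filters.T)               # (positions, num_filters)

# k-max pooling: keep the k largest activations per filter (k = 1 is plain max pooling).
pooled = np.sort(conv, axis=0)[-maxk:]                    # (maxk, num_filters)
print(pooled.shape)
```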
48 changes: 37 additions & 11 deletions mimick/inter_nearest_vecs.py
@@ -7,7 +7,9 @@
import numpy as np
import collections
import argparse
from model import LSTMMimick
from model import LSTMMimick, CNNMimick
from make_dataset import read_text_embs
from consts import *

__author__ = "Yuval Pinter, 2017"

@@ -22,19 +24,43 @@ def dist(v1, v2):
if __name__ == "__main__":
# parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--mimick", required=True, dest="mimick", help="Mimick model file")
parser.add_argument("--c2i", required=True, dest="c2i", help="Mimick char-to-integer mapping file")
parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file with reference word vectors")
parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format")
#parser.add_argument("--ktop", dest="ktop", default=10, help="Number of top neighbors to present (optional)")
parser.add_argument("--mimick", required=True, help="Mimick model file")
parser.add_argument("--use-cnn", action="store_true", help="Use CNN model")
parser.add_argument("--c2i", required=True, help="Mimick char-to-integer mapping file")
parser.add_argument("--vectors", required=True, help="Pickle file with reference word vectors")
parser.add_argument("--w2v-format", action="store_true", help="Vector file is in textual w2v format")
parser.add_argument("--ktop", type=int, default=10, help="Number of top neighbors to present (optional)")
parser.add_argument("--char-dim", type=int, default=DEFAULT_CHAR_DIM, help="dimension for character embeddings (default = {})".format(DEFAULT_CHAR_DIM))
parser.add_argument("--hidden-dim", type=int, default=DEFAULT_HIDDEN_DIM, help="dimension for LSTM layers (default = {})".format(DEFAULT_HIDDEN_DIM))
### LSTM ###
parser.add_argument("--num-lstm-layers", type=int, default=DEFAULT_LSTM_LAYERS, help="Number of LSTM layers (default = {})".format(DEFAULT_LSTM_LAYERS))
### CNN ###
parser.add_argument("--num-conv-layers", type=int, default=DEFAULT_CNN_LAYERS, help="Number of CNN layers (default = 1)")
parser.add_argument("--window-width", type=int, default=DEFAULT_WINDOW_WIDTH, help="Width of CNN layers (default = 3)")
parser.add_argument("--pooling-maxk", type=int, default=DEFAULT_POOLING_MAXK, help="K for K-max pooling (default = 1)")
parser.add_argument("--stride", dest="w_stride", default=DEFAULT_STRIDE, help="'Width' stride for CNN layers (default = 1)")

opts = parser.parse_args()

# load vocab
if opts.w2v_format:
voc_words, voc_vecs = read_text_embs(opts.vectors)
else:
voc_words, voc_vecs = pickle.load(open(opts.vectors))

we_dim = len(voc_vecs[0])

# load model
c2i = pickle.load(open(opts.c2i))
mimick = LSTMMimick(c2i, file=opts.mimick)

# load vocab
voc_words, voc_vecs = pickle.load(open(opts.vectors))
if opts.use_cnn:
mimick = CNNMimick(c2i, num_conv_layers=opts.num_conv_layers, char_dim=opts.char_dim,\
hidden_dim=opts.hidden_dim, window_width=opts.window_width,\
pooling_maxk=opts.pooling_maxk, w_stride=opts.w_stride,\
word_embedding_dim=we_dim, file=opts.mimick)
else:
mimick = LSTMMimick(c2i, num_lstm_layers=opts.num_lstm_layers, char_dim=opts.char_dim,\
hidden_dim=opts.hidden_dim,\
word_embedding_dim=we_dim, file=opts.mimick)

# prompt
while True:
@@ -45,5 +71,5 @@ def dist(v1, v2):

word_chars = [c2i[c] for c in next_word]
pred_vec = mimick.predict_emb(word_chars).value()
top_k = sorted([(iv, dist(iv_vec, pred_vec)) for iv,iv_vec in zip(voc_words, voc_vecs)], key=lambda x: x[1])[:10]
top_k = sorted([(iv, dist(iv_vec, pred_vec)) for iv,iv_vec in zip(voc_words, voc_vecs)], key=lambda x: x[1])[:opts.ktop]
print '\n'.join(['{}:\t{:.3f}'.format(near[0], 1.0 - near[1]) for near in top_k])
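
For context, the neighbor ranking at the end of this script amounts to sorting the reference vocabulary by distance to the predicted embedding. A minimal sketch of that step, assuming `dist` is cosine distance (which the `1.0 - distance` similarity printout suggests; the body of `dist` is not shown in this diff), with hypothetical data:

```python
import numpy as np

def cosine_dist(v1, v2):
    # Assumed definition of dist(): 1 - cosine similarity.
    return 1.0 - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def nearest(pred_vec, voc_words, voc_vecs, ktop=10):
    # Rank all reference vectors by distance to the predicted embedding.
    scored = sorted(((w, cosine_dist(v, pred_vec)) for w, v in zip(voc_words, voc_vecs)),
                    key=lambda x: x[1])
    return scored[:ktop]

voc_words = ["cat", "dog", "tree"]
voc_vecs = [np.random.randn(64) for _ in voc_words]
for word, d in nearest(np.random.randn(64), voc_words, voc_vecs, ktop=2):
    print('{}:\t{:.3f}'.format(word, 1.0 - d))
```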
98 changes: 50 additions & 48 deletions mimick/make_dataset.py
@@ -1,9 +1,10 @@
'''
Creates dataset for trained-embeddings prediction by character Bi-LSTM.
Inputs:
- A pre-trained embedding model that is to be emulated by character model
- A set of downstream-task vocab words, those of which not present in the
- A pre-trained embedding dictionary that is to be emulated by character model
- An optional set of downstream-task vocab words, those of which not present in the
pre-trained embeddings will be output by the character model
(only important for sanity statistics following model training)
'''
from __future__ import division
from _collections import defaultdict
@@ -13,6 +14,8 @@
import collections
import numpy as np

from util import charseq

__author__ = "Yuval Pinter, 2017"

POLYGLOT_UNK = unicode("<UNK>")
@@ -32,66 +35,65 @@ def read_text_embs(filename):
embs.append(np.array([float(s) for s in split[1:]]))
return words, embs

def charseq(word, c2i):
chars = []
for c in word:
if c not in c2i:
c2i[c] = len(c2i)
chars.append(c2i[c])
return chars

# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file from which to get target word vectors")
parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format")
parser.add_argument("--vocab", required=True, dest="vocab", help="File containing words for unlabeled test set")
parser.add_argument("--output", required=True, dest="output", help="Output filename (.pkl)")
if __name__ == "__main__":
# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file from which to get target word vectors")
parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format")
parser.add_argument("--vocab", dest="vocab", help="File containing words for unlabeled test set (optional)")
parser.add_argument("--output", required=True, dest="output", help="Output filename (.pkl)")

options = parser.parse_args()
options = parser.parse_args()

c2i = {}
training_instances = []
test_instances = []
c2i = {}
training_instances = []
test_instances = []

# Read in the output vocab
with codecs.open(options.vocab, "r", "utf-8") as f:
vocab = set([ line.strip() for line in f ])
# Read in the output vocab
if options.vocab is None:
vocab = []
else:
with codecs.open(options.vocab, "r", "utf-8") as f:
vocab = set([ line.strip() for line in f ])

# read embeddings file
if options.w2v_format:
words, embs = read_text_embs(options.vectors)
else:
words, embs = cPickle.load(open(options.vectors, "r"))
dim = len(embs[0])
word_to_ix = {w : i for (i,w) in enumerate(words)}
# read embeddings file
if options.w2v_format:
words, embs = read_text_embs(options.vectors)
else:
words, embs = cPickle.load(open(options.vectors, "r"))
dim = len(embs[0])
word_to_ix = {w : i for (i,w) in enumerate(words)}

with codecs.open(options.output, "w", "utf-8") as outfile:
in_vocab = 0
total = len(vocab)
for word, emb in zip(words, embs):
if word == POLYGLOT_UNK or word == W2V_UNK: continue
if word in vocab:
in_vocab += 1
training_instances.append(Instance(charseq(word, c2i), emb))
training_char_count = len(c2i)
for v in vocab:
if v not in words:
test_instances.append(Instance(charseq(v, c2i), np.array([0.0] * dim)))
print "Total Number of output words:", total
print "Total in Training Vocabulary:", in_vocab
print "Percentage in-vocab:", in_vocab / total
print "Total in Embeddings vocabulary:", len(words)
print "Training set character count: ", training_char_count
print "Total haracter count: ", len(c2i)

c2i[PADDING_CHAR] = len(c2i)
# Test
if len(vocab) > 0:
in_vocab = 0
total = len(vocab)
for v in vocab:
if v not in words:
test_instances.append(Instance(charseq(v, c2i), np.array([0.0] * dim)))
print "Total Number of output words:", total
print "Total in Training Vocabulary:", in_vocab
print "Percentage in-vocab:", in_vocab / total
print "Total character count: ", len(c2i)

c2i[PADDING_CHAR] = len(c2i)

# populate output
output = {}
output["c2i"] = c2i
output["training_instances"] = training_instances
output["test_instances"] = test_instances
# populate output
output = {}
output["c2i"] = c2i
output["training_instances"] = training_instances
output["test_instances"] = test_instances

# write output
with open(options.output, "w") as outfile:
cPickle.dump(output, outfile)
# write output
with open(options.output, "w") as outfile:
cPickle.dump(output, outfile)
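
As an aside, the `charseq` helper deleted above now lives in `util` (see the new import at the top of this file). For illustration, a minimal re-statement with a small usage example showing how it grows the character-to-index map as it encodes a word:

```python
def charseq(word, c2i):
    # Encode a word as character indices, adding unseen characters to c2i.
    chars = []
    for c in word:
        if c not in c2i:
            c2i[c] = len(c2i)
        chars.append(c2i[c])
    return chars

c2i = {}
print(charseq("mimick", c2i))   # [0, 1, 0, 1, 2, 3]
print(len(c2i))                 # 4 distinct characters: m, i, c, k
```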