CNN-Mimick and DyNet 2.0 #2

Merged (20 commits), Nov 22, 2017
7 changes: 7 additions & 0 deletions README.md
@@ -14,3 +14,10 @@ The root directory of this repository contains the code required to perform extr

The entry point is [model.py](model.py), which can use tagging datasets created using the [make_dataset.py](make_dataset.py) script.
Note that `model.py` accepts pre-trained Word Embedding models via **text files** with no header. For Mimick models, this exact format is produced at the path given by the `--output` argument of the [mimick/model.py](mimick/model.py) script. For Word2Vec, FastText, or Polyglot models, such a file can be created using the [scripts/output_word_vectors.py](scripts/output_word_vectors.py) script, which accepts a model (.pkl or .bin) and the desired output vocabulary (.txt).
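
As an editorial illustration (not part of the README or this diff), here is a minimal sketch of a reader for that header-less text format, simplified from the `read_text_embs` helper that appears later in this diff; real vocabularies would also need UTF-8 handling via `codecs.open`:

```python
import numpy as np

def read_text_embs(path):
    # One word per line, no header: <word> <float_1> ... <float_d>
    words, embs = [], []
    with open(path) as f:
        for line in f:
            parts = line.strip().split()
            words.append(parts[0])
            embs.append(np.array([float(x) for x in parts[1:]]))
    return words, embs
```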

## CNN Experiment (October 2017)
References:
- Zhang et al., 2015. [Character-level Convolutional Networks for Text Classification](https://arxiv.org/abs/1509.01626).
- dos Santos and Zadrozny, 2014. [Learning Character-level Representations for Part-of-Speech Tagging](http://proceedings.mlr.press/v32/santos14.pdf). Applied to POS tagging.
- Shen et al., CIKM 2014. [A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval](https://dl.acm.org/citation.cfm?id=2661935). Extracted from word n-grams, applied to IR.
- Kanaris et al., 2007. [Words Versus Character N-Grams For Anti-Spam Filtering](http://www.worldscientific.com/doi/abs/10.1142/S0218213007003692).
17 changes: 17 additions & 0 deletions mimick/consts.py
@@ -0,0 +1,17 @@
# REPRESENTATION
POLYGLOT_UNK = unicode("<UNK>")
PADDING_CHAR = "<*>"

# MODEL PARAMS
DEFAULT_CHAR_DIM = 20
DEFAULT_HIDDEN_DIM = 50
DEFAULT_WORD_DIM = 64

# LSTM
DEFAULT_LSTM_LAYERS = 1

# CNN
DEFAULT_WINDOW_WIDTH = 3
DEFAULT_POOLING_MAXK = 1
DEFAULT_STRIDE = 1
DEFAULT_CNN_LAYERS = 1
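
To make these defaults concrete, here is a minimal NumPy sketch (an editorial illustration, not the repository's DyNet implementation) of a single character-level convolution with window width 3, stride 1, and k-max pooling with k = 1; all sizes are hypothetical:

```python
import numpy as np

# Hypothetical sizes matching the defaults above.
char_dim, window_width, stride, maxk = 20, 3, 1, 1
num_chars, num_filters = 9, 50           # e.g. a 9-character word, hidden dim 50

chars = np.random.randn(num_chars, char_dim)              # character embeddings
filters = np.random.randn(num_filters, window_width * char_dim)

# Slide the window over the character sequence.
windows = [chars[i:i + window_width].ravel()
           for i in range(0, num_chars - window_width + 1, stride)]
conv = np.dot(np.stack(windows), filters.T)               # (positions, num_filters)

# k-max pooling: keep the k largest activations per filter (k = 1 is plain max pooling).
pooled = np.sort(conv, axis=0)[-maxk:]                    # (maxk, num_filters)
print(pooled.shape)
```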
48 changes: 37 additions & 11 deletions mimick/inter_nearest_vecs.py
@@ -7,7 +7,9 @@
import numpy as np
import collections
import argparse
from model import LSTMMimick
from model import LSTMMimick, CNNMimick
from make_dataset import read_text_embs
from consts import *

__author__ = "Yuval Pinter, 2017"

@@ -22,19 +24,43 @@ def dist(v1, v2):
if __name__ == "__main__":
# parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--mimick", required=True, dest="mimick", help="Mimick model file")
parser.add_argument("--c2i", required=True, dest="c2i", help="Mimick char-to-integer mapping file")
parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file with reference word vectors")
parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format")
#parser.add_argument("--ktop", dest="ktop", default=10, help="Number of top neighbors to present (optional)")
parser.add_argument("--mimick", required=True, help="Mimick model file")
parser.add_argument("--use-cnn", action="store_true", help="Use CNN model")
parser.add_argument("--c2i", required=True, help="Mimick char-to-integer mapping file")
parser.add_argument("--vectors", required=True, help="Pickle file with reference word vectors")
parser.add_argument("--w2v-format", action="store_true", help="Vector file is in textual w2v format")
parser.add_argument("--ktop", type=int, default=10, help="Number of top neighbors to present (optional)")
parser.add_argument("--char-dim", type=int, default=DEFAULT_CHAR_DIM, help="dimension for character embeddings (default = {})".format(DEFAULT_CHAR_DIM))
parser.add_argument("--hidden-dim", type=int, default=DEFAULT_HIDDEN_DIM, help="dimension for LSTM layers (default = {})".format(DEFAULT_HIDDEN_DIM))
### LSTM ###
parser.add_argument("--num-lstm-layers", type=int, default=DEFAULT_LSTM_LAYERS, help="Number of LSTM layers (default = {})".format(DEFAULT_LSTM_LAYERS))
### CNN ###
parser.add_argument("--num-conv-layers", type=int, default=DEFAULT_CNN_LAYERS, help="Number of CNN layers (default = 1)")
parser.add_argument("--window-width", type=int, default=DEFAULT_WINDOW_WIDTH, help="Width of CNN layers (default = 3)")
parser.add_argument("--pooling-maxk", type=int, default=DEFAULT_POOLING_MAXK, help="K for K-max pooling (default = 1)")
parser.add_argument("--stride", dest="w_stride", default=DEFAULT_STRIDE, help="'Width' stride for CNN layers (default = 1)")

opts = parser.parse_args()

# load vocab
if opts.w2v_format:
voc_words, voc_vecs = read_text_embs(opts.vectors)
else:
voc_words, voc_vecs = pickle.load(open(opts.vectors))

we_dim = len(voc_vecs[0])

# load model
c2i = pickle.load(open(opts.c2i))
mimick = LSTMMimick(c2i, file=opts.mimick)

# load vocab
voc_words, voc_vecs = pickle.load(open(opts.vectors))
if opts.use_cnn:
mimick = CNNMimick(c2i, num_conv_layers=opts.num_conv_layers, char_dim=opts.char_dim,\
hidden_dim=opts.hidden_dim, window_width=opts.window_width,\
pooling_maxk=opts.pooling_maxk, w_stride=opts.w_stride,\
word_embedding_dim=we_dim, file=opts.mimick)
else:
mimick = LSTMMimick(c2i, num_lstm_layers=opts.num_lstm_layers, char_dim=opts.char_dim,\
hidden_dim=opts.hidden_dim,\
word_embedding_dim=we_dim, file=opts.mimick)

# prompt
while True:
@@ -45,5 +71,5 @@ def dist(v1, v2):

word_chars = [c2i[c] for c in next_word]
pred_vec = mimick.predict_emb(word_chars).value()
top_k = sorted([(iv, dist(iv_vec, pred_vec)) for iv,iv_vec in zip(voc_words, voc_vecs)], key=lambda x: x[1])[:10]
top_k = sorted([(iv, dist(iv_vec, pred_vec)) for iv,iv_vec in zip(voc_words, voc_vecs)], key=lambda x: x[1])[:opts.ktop]
print '\n'.join(['{}:\t{:.3f}'.format(near[0], 1.0 - near[1]) for near in top_k])
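
For context, the neighbor ranking at the end of this script amounts to sorting the reference vocabulary by distance to the predicted embedding. A minimal sketch of that step, assuming `dist` is cosine distance (which the `1.0 - distance` similarity printout suggests; the body of `dist` is not shown in this diff), with hypothetical data:

```python
import numpy as np

def cosine_dist(v1, v2):
    # Assumed definition of dist(): 1 - cosine similarity.
    return 1.0 - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def nearest(pred_vec, voc_words, voc_vecs, ktop=10):
    # Rank all reference vectors by distance to the predicted embedding.
    scored = sorted(((w, cosine_dist(v, pred_vec)) for w, v in zip(voc_words, voc_vecs)),
                    key=lambda x: x[1])
    return scored[:ktop]

voc_words = ["cat", "dog", "tree"]
voc_vecs = [np.random.randn(64) for _ in voc_words]
for word, d in nearest(np.random.randn(64), voc_words, voc_vecs, ktop=2):
    print('{}:\t{:.3f}'.format(word, 1.0 - d))
```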
98 changes: 50 additions & 48 deletions mimick/make_dataset.py
@@ -1,9 +1,10 @@
'''
Creates dataset for trained-embeddings prediction by character Bi-LSTM.
Inputs:
- A pre-trained embedding model that is to be emulated by character model
- A set of downstream-task vocab words, those of which not present in the
- A pre-trained embedding dictionary that is to be emulated by character model
- An optional set of downstream-task vocab words, those of which not present in the
pre-trained embeddings will be output by the character model
(only important for sanity statistics following model training)
'''
from __future__ import division
from _collections import defaultdict
@@ -13,6 +14,8 @@
import collections
import numpy as np

from util import charseq

__author__ = "Yuval Pinter, 2017"

POLYGLOT_UNK = unicode("<UNK>")
@@ -32,66 +35,65 @@ def read_text_embs(filename):
embs.append(np.array([float(s) for s in split[1:]]))
return words, embs

def charseq(word, c2i):
chars = []
for c in word:
if c not in c2i:
c2i[c] = len(c2i)
chars.append(c2i[c])
return chars

# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file from which to get target word vectors")
parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format")
parser.add_argument("--vocab", required=True, dest="vocab", help="File containing words for unlabeled test set")
parser.add_argument("--output", required=True, dest="output", help="Output filename (.pkl)")
if __name__ == "__main__":
# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--vectors", required=True, dest="vectors", help="Pickle file from which to get target word vectors")
parser.add_argument("--w2v-format", dest="w2v_format", action="store_true", help="Vector file is in textual w2v format")
parser.add_argument("--vocab", dest="vocab", help="File containing words for unlabeled test set (optional)")
parser.add_argument("--output", required=True, dest="output", help="Output filename (.pkl)")

options = parser.parse_args()
options = parser.parse_args()

c2i = {}
training_instances = []
test_instances = []
c2i = {}
training_instances = []
test_instances = []

# Read in the output vocab
with codecs.open(options.vocab, "r", "utf-8") as f:
vocab = set([ line.strip() for line in f ])
# Read in the output vocab
if options.vocab is None:
vocab = []
else:
with codecs.open(options.vocab, "r", "utf-8") as f:
vocab = set([ line.strip() for line in f ])

# read embeddings file
if options.w2v_format:
words, embs = read_text_embs(options.vectors)
else:
words, embs = cPickle.load(open(options.vectors, "r"))
dim = len(embs[0])
word_to_ix = {w : i for (i,w) in enumerate(words)}
# read embeddings file
if options.w2v_format:
words, embs = read_text_embs(options.vectors)
else:
words, embs = cPickle.load(open(options.vectors, "r"))
dim = len(embs[0])
word_to_ix = {w : i for (i,w) in enumerate(words)}

with codecs.open(options.output, "w", "utf-8") as outfile:
in_vocab = 0
total = len(vocab)
for word, emb in zip(words, embs):
if word == POLYGLOT_UNK or word == W2V_UNK: continue
if word in vocab:
in_vocab += 1
training_instances.append(Instance(charseq(word, c2i), emb))
training_char_count = len(c2i)
for v in vocab:
if v not in words:
test_instances.append(Instance(charseq(v, c2i), np.array([0.0] * dim)))
print "Total Number of output words:", total
print "Total in Training Vocabulary:", in_vocab
print "Percentage in-vocab:", in_vocab / total
print "Total in Embeddings vocabulary:", len(words)
print "Training set character count: ", training_char_count
print "Total haracter count: ", len(c2i)

c2i[PADDING_CHAR] = len(c2i)
# Test
if len(vocab) > 0:
in_vocab = 0
total = len(vocab)
for v in vocab:
if v not in words:
test_instances.append(Instance(charseq(v, c2i), np.array([0.0] * dim)))
print "Total Number of output words:", total
print "Total in Training Vocabulary:", in_vocab
print "Percentage in-vocab:", in_vocab / total
print "Total character count: ", len(c2i)

c2i[PADDING_CHAR] = len(c2i)

# populate output
output = {}
output["c2i"] = c2i
output["training_instances"] = training_instances
output["test_instances"] = test_instances
# populate output
output = {}
output["c2i"] = c2i
output["training_instances"] = training_instances
output["test_instances"] = test_instances

# write output
with open(options.output, "w") as outfile:
cPickle.dump(output, outfile)
# write output
with open(options.output, "w") as outfile:
cPickle.dump(output, outfile)
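
As an aside, the `charseq` helper deleted above now lives in `util` (see the new import at the top of this file). For illustration, a minimal re-statement with a small usage example showing how it grows the character-to-index map as it encodes a word:

```python
def charseq(word, c2i):
    # Encode a word as character indices, adding unseen characters to c2i.
    chars = []
    for c in word:
        if c not in c2i:
            c2i[c] = len(c2i)
        chars.append(c2i[c])
    return chars

c2i = {}
print(charseq("mimick", c2i))   # [0, 1, 0, 1, 2, 3]
print(len(c2i))                 # 4 distinct characters: m, i, c, k
```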