train.py
from autodiff import gradients, assign


def find_topo_sort(node_list):
    """Given a list of nodes, return a topological sort list of nodes ending in them.

    A simple algorithm is to do a post-order DFS traversal on the given nodes,
    going backwards based on input edges. Since a node is added to the ordering
    after all its predecessors are traversed due to post-order DFS, we get a
    topological sort.
    """
    visited = set()
    topo_order = []
    for node in node_list:
        topo_sort_dfs(node, visited, topo_order)
    return topo_order


def topo_sort_dfs(node, visited, topo_order):
    """Post-order DFS."""
    if node in visited:
        return
    visited.add(node)
    for n in node.inputs:
        topo_sort_dfs(n, visited, topo_order)
    topo_order.append(node)
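

# Sketch (not part of the original training code): a minimal illustration of the
# post-order topological sort above. _DemoNode is a hypothetical stand-in for the
# autodiff Node type; it only provides the `inputs` attribute that find_topo_sort
# relies on. For a chain c <- b <- a, the sort yields [a, b, c]: every node
# appears after all of its inputs.
class _DemoNode:
    def __init__(self, name, inputs=()):
        self.name = name
        self.inputs = list(inputs)

    def __repr__(self):
        return self.name


def _demo_find_topo_sort():
    a = _DemoNode("a")
    b = _DemoNode("b", [a])
    c = _DemoNode("c", [b])
    return find_topo_sort([c])  # -> [a, b, c]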


class GradientDescentOptimizer:
    """Vanilla gradient descent: each trainable variable is updated in place
    with var := var - learning_rate * d(cost)/d(var)."""

    def __init__(self, learning_rate):
        self.learning_rate = learning_rate

    def minimize(self, cost):
        """Build the list of assign nodes that perform one gradient descent
        step on every trainable variable reachable from `cost`."""
        trainable_vars = self.find_trainable_vars(cost)
        print("trainable_vars:", trainable_vars)
        grad_list = gradients(cost, trainable_vars)
        assert len(trainable_vars) == len(grad_list)
        train_steps = []
        for var, var_grad in zip(trainable_vars, grad_list):
            train_steps.append(assign(var, var - self.learning_rate * var_grad))
        return train_steps
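
    # Worked example (scalar values, for illustration only): with
    # learning_rate = 0.1, a variable currently holding 2.0 and a gradient of
    # 0.5, the assign node built above computes 2.0 - 0.1 * 0.5 = 1.95 as the
    # variable's new value when the training step is executed.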

    def find_trainable_vars(self, cost):
        """Find the trainable variables with a modified DFS that explores the
        non-trainable inputs of a node before its trainable inputs. As a
        result, in the returned list, trainable weights that lie further from
        the root of the cost graph come before trainable weights that lie
        nearer to the root.

        The list of gradients follows this ordering, and we want to flow the
        incoming gradient (i.e. apply the updates) to the deeper trainable
        weights first and to the shallower ones afterwards: the gradient of a
        shallow weight depends on the incoming gradient and on a matrix
        multiply involving the deeper weights, while the gradient of a deep
        weight depends directly on the values of the shallower weights. So the
        shallow weights must not be modified before the deeper weights when
        the weights are updated during training.

        For example, suppose we have the following cost function (a sketch of
        this setup appears at the bottom of this file):

            W, W1 = some trainable weights
            x, labels = ad.placeholder(name="x"), ad.placeholder(name="labels")
            matmul = ad.matmul(W, x)
            matmul1 = ad.matmul(W1, matmul)
            cost = ad.reduce_mean(ad.softmax_with_cross_entropy(matmul1, labels))

        W1 sits at a smaller depth than W in the cost tree; W lies at the
        bottom. The gradient flow therefore has to reach W first and then W1,
        so in this case topo_order will be [W, W1].
        """
        visited = set()
        topo_order = []
        self.topo_sort_dfs_m(cost, visited, topo_order)
        return topo_order

    def topo_sort_dfs_m(self, node, visited, topo_order):
        """Post-order DFS that records only trainable nodes."""
        if node in visited:
            return
        visited.add(node)
        for n in self.reorder_nodes(node.inputs):
            self.topo_sort_dfs_m(n, visited, topo_order)
        if node.trainable:
            topo_order.append(node)

    def reorder_nodes(self, nodes):
        """Reorder the nodes for the DFS: collect all the non-trainable nodes
        first and then append the trainable nodes after them."""
        node_list = [node for node in nodes if not node.trainable]
        trainable_vars = [node for node in nodes if node.trainable]
        node_list.extend(trainable_vars)
        return node_list
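

# Usage sketch (assumptions noted inline): this mirrors the example in the
# find_trainable_vars docstring. The placeholder/matmul/reduce_mean/
# softmax_with_cross_entropy calls on the `ad` module are the ones that
# docstring refers to; how the trainable weights W and W1 are constructed and
# how the returned assign nodes are executed is not shown in this file, so
# those parts are passed in or left as comments rather than invented API calls.
def _train_sketch(ad, W, W1, learning_rate=0.01):
    # W, W1: trainable weight nodes created elsewhere via the autodiff API.
    x, labels = ad.placeholder(name="x"), ad.placeholder(name="labels")
    matmul = ad.matmul(W, x)
    matmul1 = ad.matmul(W1, matmul)
    cost = ad.reduce_mean(ad.softmax_with_cross_entropy(matmul1, labels))

    optimizer = GradientDescentOptimizer(learning_rate)
    train_steps = optimizer.minimize(cost)  # assign nodes, deepest weights first
    # Each training iteration would then evaluate `train_steps` (together with
    # feeds for x and labels) using whatever executor the autodiff package
    # provides.
    return train_steps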