Backpropagation Implementation
March 14, 2025
16:09
class Value:  # Value plays the role of a Tensor class
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0                  # a tensor needs to store its gradient grad
        # internal variables used for autograd graph construction
        self._backward = lambda: None  # a tensor needs to define a backward function
        self._prev = set(_children)    # a tensor needs to store its child nodes
        self._op = _op  # the op that produced this node, for graphviz / debugging / etc
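For example (a minimal sketch, assuming the complete Value class in this note has been defined; the names a, b, c are purely illustrative), each Value records its data, its gradient, and how it was produced:

a = Value(2.0)
b = Value(-1.0)
c = a * b              # a new node holding the result, its children and the op that made it

print(c.data)          # -2.0
print(c.grad)          # 0  (gradients start at zero)
print(c._prev)         # {Value(data=2.0, grad=0), Value(data=-1.0, grad=0)}  (set, order may vary)
print(c._op)           # '*'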
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out
    def __mul__(self, other):
        # Every operation (add, sub, mul, div, tanh, ...) produces a new tensor,
        # namely the `out` in `return out` below.
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Whenever a new tensor is created, a _backward function is defined for it.
            # Its job is to update the gradients of all of out's children from out's gradient.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

⚠️ Note that gradients are updated with +=, not =, because a child tensor may receive gradient more than once. As shown in the figure, with x = a + a the tensor a is updated twice. Precisely because it is +=, the gradients must be zeroed before each batch's backward pass, otherwise they keep accumulating. (As for why simply accumulating is acceptable, my understanding is that when the learning rate is small enough, accumulation is a good approximation.) See the sketch just below.

⚠️ Chain rule: a child tensor's gradient is (the derivative of out with respect to that child) × (out's gradient), hence self.grad += other.data * out.grad.
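A minimal sketch of the accumulation behaviour, assuming the complete Value class in this note has been defined; the names a, x, y are illustrative only:

a = Value(3.0)
x = a + a          # a appears twice in the graph, so it receives two gradient contributions
x.backward()
print(a.grad)      # 2  (one contribution from each use of a, accumulated via +=)

# before reusing a in another backward pass, its gradient must be reset,
# otherwise the next backward() would add on top of the old value
a.grad = 0
y = a * a
y.backward()
print(a.grad)      # 6.0 = 2 * a.data  (would be 8.0 without the reset above)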
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out
    def backward(self):
        # topological order all of the children in the graph
        # (i.e. topologically sort every tensor node in the computation graph)
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient:
        # walk the topologically sorted nodes from back to front and call each node's
        # _backward function (see the sketch after this method)
        self.grad = 1
        for v in reversed(topo):
            v._backward()
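A small sketch of a full backward pass, assuming the complete Value class in this note; the expression and the names a, b, d are illustrative only:

a = Value(2.0)
b = Value(-3.0)
d = a * b + a**2       # d depends on a through two paths: a*b and a**2

d.backward()           # builds the topo order, then traverses it in reverse calling _backward

# chain rule, with the contributions from both paths accumulated via +=
print(d.grad)          # 1
print(a.grad)          # b.data + 2*a.data = -3.0 + 4.0 = 1.0
print(b.grad)          # a.data = 2.0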
    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"
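The operators above (__neg__, __sub__, __truediv__ and the reflected variants) are all expressed in terms of add, mul and pow, so gradients flow through them without any extra backward code. A quick sketch, with purely illustrative values:

a = Value(4.0)
b = Value(2.0)

y = (1 - a) / b        # uses __rsub__ and __truediv__, which reduce to +, * and ** nodes
y.backward()

print(y)               # Value(data=-1.5, grad=1)
print(a.grad)          # d/da[(1 - a) / b] = -1/b = -0.5
print(b.grad)          # d/db[(1 - a) / b] = -(1 - a)/b**2 = 0.75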