Backpropagation Implementation
March 14, 2025
16:09
class Value:  # Value plays the role of a Tensor class
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0                  # a tensor needs to store its gradient grad
        # internal variables used for autograd graph construction
        self._backward = lambda: None  # a tensor needs to define a backward function
        self._prev = set(_children)    # a tensor needs to store its child nodes
        self._op = _op  # the op that produced this node, for graphviz / debugging / etc
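For example (a minimal sketch, assuming the complete Value class in this note has been defined; the names a, b, c are purely illustrative), each Value records its data, its gradient, and how it was produced:

a = Value(2.0)
b = Value(-1.0)
c = a * b              # a new node holding the result, its children and the op that made it

print(c.data)          # -2.0
print(c.grad)          # 0  (gradients start at zero)
print(c._prev)         # {Value(data=2.0, grad=0), Value(data=-1.0, grad=0)}  (set, order may vary)
print(c._op)           # '*'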
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out
    def __mul__(self, other):
        # Every operation (add, sub, mul, div, tanh, ...) produces a new tensor,
        # namely the `out` in `return out` below.
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Whenever a new tensor is created, a _backward function is defined for it.
            # Its job is to update the gradients of all of out's children from out's gradient.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

⚠️ Note that gradients are updated with +=, not =, because a child tensor may receive gradient more than once. As shown in the figure, with x = a + a the tensor a is updated twice. Precisely because it is +=, the gradients must be zeroed before each batch's backward pass, otherwise they keep accumulating. (As for why simply accumulating is acceptable, my understanding is that when the learning rate is small enough, accumulation is a good approximation.) See the sketch just below.

⚠️ Chain rule: a child tensor's gradient is (the derivative of out with respect to that child) × (out's gradient), hence self.grad += other.data * out.grad.
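A minimal sketch of the accumulation behaviour, assuming the complete Value class in this note has been defined; the names a, x, y are illustrative only:

a = Value(3.0)
x = a + a          # a appears twice in the graph, so it receives two gradient contributions
x.backward()
print(a.grad)      # 2  (one contribution from each use of a, accumulated via +=)

# before reusing a in another backward pass, its gradient must be reset,
# otherwise the next backward() would add on top of the old value
a.grad = 0
y = a * a
y.backward()
print(a.grad)      # 6.0 = 2 * a.data  (would be 8.0 without the reset above)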
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out
    def backward(self):
        # topological order all of the children in the graph
        # (i.e. topologically sort every tensor node in the computation graph)
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient:
        # walk the topologically sorted nodes from back to front and call each node's
        # _backward function (see the sketch after this method)
        self.grad = 1
        for v in reversed(topo):
            v._backward()
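A small sketch of a full backward pass, assuming the complete Value class in this note; the expression and the names a, b, d are illustrative only:

a = Value(2.0)
b = Value(-3.0)
d = a * b + a**2       # d depends on a through two paths: a*b and a**2

d.backward()           # builds the topo order, then traverses it in reverse calling _backward

# chain rule, with the contributions from both paths accumulated via +=
print(d.grad)          # 1
print(a.grad)          # b.data + 2*a.data = -3.0 + 4.0 = 1.0
print(b.grad)          # a.data = 2.0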
    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"
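The operators above (__neg__, __sub__, __truediv__ and the reflected variants) are all expressed in terms of add, mul and pow, so gradients flow through them without any extra backward code. A quick sketch, with purely illustrative values:

a = Value(4.0)
b = Value(2.0)

y = (1 - a) / b        # uses __rsub__ and __truediv__, which reduce to +, * and ** nodes
y.backward()

print(y)               # Value(data=-1.5, grad=1)
print(a.grad)          # d/da[(1 - a) / b] = -1/b = -0.5
print(b.grad)          # d/db[(1 - a) / b] = -(1 - a)/b**2 = 0.75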