Backpropagation Implementation

2025-03-14 16:09

 

class Value:  # a Value plays the role of a Tensor

    """ stores a single scalar value and its gradient """

 

    def __init__(self, data, _children=(), _op=''):

        self.data = data

        self.grad = 0  # each tensor stores its gradient (grad)

        # internal variables used for autograd graph construction

        self._backward = lambda: None  # each tensor defines a backward function

        self._prev = set(_children)  # each tensor stores its child nodes

        self._op = _op # the op that produced this node, for graphviz / debugging / etc

 

    def __add__(self, other):

        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data + other.data, (self, other), '+')

 

        def _backward():
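            # for addition, d(out)/d(self) = d(out)/d(other) = 1,
            # so out.grad flows through to both children unchanged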

            self.grad += out.grad

            other.grad += out.grad

        out._backward = _backward

 

        return out

 

    def __mul__(self, other):  # every op (add, sub, mul, div, relu, ...) creates a new tensor, the "out" that gets returned

        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data * other.data, (self, other), '*')

 

        def _backward():
            # A backward function is defined whenever a new tensor is created;
            # given out's gradient, it updates the gradients of all of out's
            # children via the chain rule: child.grad += (d out / d child) * out.grad,
            # which is exactly self.grad += other.data * out.grad below.
            # Note the update is +=, not =, because a child tensor may be updated
            # more than once (e.g. for x = a + a, tensor a is updated twice).
            # This is also why grads must be zeroed before each batch's backward(),
            # otherwise they keep accumulating. (As for why simple accumulation is
            # acceptable, my understanding is that with a small enough learning
            # rate it is a reasonable approximation.)
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward

 

        return out

 

    def __pow__(self, other):

        assert isinstance(other, (int, float)), "only supporting int/float powers for now"

        out = Value(self.data**other, (self,), f'**{other}')

 

        def _backward():
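            # power rule: d(self**other)/d(self) = other * self**(other-1)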

            self.grad += (other * self.data**(other-1)) * out.grad

        out._backward = _backward

 

        return out

 

    def relu(self):

        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

 

        def _backward():
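            # ReLU derivative: 1 where the output is positive, 0 otherwise,
            # which is what (out.data > 0) evaluates to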

            self.grad += (out.data > 0) * out.grad

        out._backward = _backward

 

        return out

 

    def backward(self):

 

        # topologically sort all of the nodes (tensors) in the computation graph

        topo = []

        visited = set()

        def build_topo(v):

            if v not in visited:

                visited.add(v)

                for child in v._prev:

                    build_topo(child)

                topo.append(v)

        build_topo(self)

 

        # go one variable at a time and apply the chain rule to get its gradient

        self.grad = 1   # seed the output node: d(self)/d(self) = 1

        for v in reversed(topo):  # walk the topologically sorted nodes from last to first and call each node's backward function

            v._backward()
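
        # Example (an illustrative graph): with c = a * b and d = c + a,
        # build_topo(d) always lists children before their parents (one possible
        # order is [a, b, c, d]), so the reversed walk visits d, c, b, a and
        # each node's gradient is complete before it is pushed to its children.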

 

    def __neg__(self): # -self

        return self * -1

 

    def __radd__(self, other): # other + self

        return self + other

 

    def __sub__(self, other): # self - other

        return self + (-other)

 

    def __rsub__(self, other): # other - self

        return other + (-self)

 

    def __rmul__(self, other): # other * self

        return self * other

 

    def __truediv__(self, other): # self / other

        return self * other**-1

 

    def __rtruediv__(self, other): # other / self

        return other * self**-1

 

    def __repr__(self):

        return f"Value(data={self.data}, grad={self.grad})"

 
