\[ \nabla_\theta J(\pi_\theta) = \mathbb{E}_{\pi_\theta} \left[ \nabla_\theta \log \pi_\theta(a \mid s)\; q_\pi(s, a) \right]. \]
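For completeness, the score-function identity behind this expectation form is worth stating (the full policy gradient theorem additionally handles the dependence of the state distribution on $\theta$; this step alone treats the state and $q_\pi$ as fixed):

\[ \nabla_\theta \sum_a \pi_\theta(a \mid s)\, q_\pi(s, a) = \sum_a q_\pi(s, a)\, \nabla_\theta \pi_\theta(a \mid s) = \mathbb{E}_{a \sim \pi_\theta} \left[ \nabla_\theta \log \pi_\theta(a \mid s)\; q_\pi(s, a) \right], \]

using $\nabla_\theta \pi_\theta = \pi_\theta \nabla_\theta \log \pi_\theta$.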
\[ G_t(\tau) = \sum_{k=0}^{T-t-1} \gamma^k R_{t+1+k}. \]
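As a concrete illustration, the returns for a whole episode can be computed in one backward pass via the recursion $G_t = R_{t+1} + \gamma G_{t+1}$, with $G_T = 0$. A minimal Python sketch (the function name and the convention that `rewards[t]` stores $R_{t+1}$ are assumptions for illustration):

```python
def discounted_returns(rewards, gamma):
    """Backward recursion G_t = R_{t+1} + gamma * G_{t+1}, with G_T = 0."""
    returns = [0.0] * len(rewards)
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g  # rewards[t] holds R_{t+1}
        returns[t] = g
    return returns

# Example: a three-step episode with gamma = 0.9
print(discounted_returns([1.0, 0.0, 2.0], 0.9))  # [2.62, 1.8, 2.0]
```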
#| html-indent-size: "1.2em"
#| html-comment-delimiter: "//"
#| html-line-number: true
#| html-line-number-punc: ":"
#| html-no-end: false
#| pdf-placement: "htb!"
#| pdf-line-number: true
\begin{algorithm}
\caption{Monte Carlo Policy Gradient (REINFORCE)}
\begin{algorithmic}
\State Initialize step size $\alpha > 0$
\State Initialize policy parameters $\theta$
\For{episode = 0 \To MAX\_EPISODE}
\State Sample trajectory $\tau$ using $\pi_\theta$
\State $g \gets 0$
\For{$t = 0 \To T-1$}
\State Compute return $G_t = \sum_{k=0}^{T-t-1} \gamma^k R_{t+1+k}$
\State $g \gets g + G_t\,\nabla_\theta \log \pi_\theta(a_t \mid s_t)$
\EndFor
\State $\theta \gets \theta + \alpha g$
\EndFor
\end{algorithmic}
\end{algorithm}
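To make the update loop concrete, here is a minimal sketch of the algorithm above in Python. Everything beyond the pseudocode is an assumption for illustration: the PyTorch policy network, the gymnasium `CartPole-v1` environment, and all hyperparameter values. Note that minimizing $-\sum_t G_t \log \pi_\theta(a_t \mid s_t)$ with a gradient step of size $\alpha$ performs the same ascent update $\theta \gets \theta + \alpha g$.

```python
import gymnasium as gym
import torch
from torch.distributions import Categorical

# Illustrative hyperparameters; the pseudocode fixes none of these values.
GAMMA, ALPHA, MAX_EPISODE = 0.99, 1e-3, 500

env = gym.make("CartPole-v1")
policy = torch.nn.Sequential(          # pi_theta: state -> action logits
    torch.nn.Linear(env.observation_space.shape[0], 64),
    torch.nn.Tanh(),
    torch.nn.Linear(64, env.action_space.n),
)
optimizer = torch.optim.SGD(policy.parameters(), lr=ALPHA)

for episode in range(MAX_EPISODE):
    # Sample one trajectory tau with the current policy.
    log_probs, rewards = [], []
    obs, _ = env.reset()
    done = False
    while not done:
        dist = Categorical(logits=policy(torch.as_tensor(obs, dtype=torch.float32)))
        action = dist.sample()
        log_probs.append(dist.log_prob(action))  # log pi_theta(a_t | s_t)
        obs, reward, terminated, truncated, _ = env.step(action.item())
        rewards.append(float(reward))
        done = terminated or truncated

    # Returns G_t by backward recursion.
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + GAMMA * g
        returns.append(g)
    returns.reverse()

    # The accumulator g = sum_t G_t grad log pi(a_t|s_t) is realized
    # through autograd by differentiating the loss below.
    loss = -torch.stack([g_t * lp for g_t, lp in zip(returns, log_probs)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```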