diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index db1778e..676fcae 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,42 +12,42 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: set up python
+      - name: Set up python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.7
+          python-version: 3.8
 
-      - name: set up env
+      - name: Set up env
         run: python -m pip install -e .[docs,dev]
 
-      - name: run black
+      - name: Run black
         run: black --check .
 
-      - name: run isort
+      - name: Run isort
         run: isort .
 
-      - name: run pylint for mdp folder
+      - name: Run pylint for mdp folder
        run: pylint src/behavior_generation_lecture_python/mdp --errors-only
 
-      - name: run mypy for mdp folder
+      - name: Run mypy for mdp folder
         run: mypy src/behavior_generation_lecture_python/mdp
 
-      - name: test
+      - name: Test
         run: |
           export DISPLAY=:99
           Xvfb :99 &
           pytest
 
-      - name: check coverage
+      - name: Check coverage
         run: |
           export DISPLAY=:99
           Xvfb :99 &
           pytest --cov=src --cov-fail-under=85
 
-      - name: copy notebooks to docs folder
+      - name: Copy notebooks to docs folder
         run: cp -r notebooks/* docs/notebooks
 
-      - name: build docs
+      - name: Build docs
         run: mkdocs build
 
   deploy-pages:
@@ -58,15 +58,15 @@ jobs:
     steps:
      - uses: actions/checkout@v2
 
-      - name: set up python
+      - name: Set up python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.7
+          python-version: 3.8
 
-      - name: set up env
+      - name: Set up env
         run: python -m pip install -e .[docs]
 
-      - name: copy notebooks to docs folder
+      - name: Copy notebooks to docs folder
         run: cp -r notebooks/* docs/notebooks
 
       - run: mkdocs gh-deploy --force
diff --git a/mkdocs.yml b/mkdocs.yml
index e347015..7a77ab3 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -58,8 +58,6 @@ plugins:
           rendering:
             show_if_no_docstring: true
             show_signature_annotations: true
-      watch:
-        - src
   - gen-files:
       scripts:
         - docs/gen_ref_pages.py
diff --git a/notebooks/mdp_policy_gradient.ipynb b/notebooks/mdp_policy_gradient.ipynb
new file mode 100644
index 0000000..0759ff9
--- /dev/null
+++ b/notebooks/mdp_policy_gradient.ipynb
@@ -0,0 +1,250 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from behavior_generation_lecture_python.mdp.policy import CategorialPolicy\n",
+    "from behavior_generation_lecture_python.utils.grid_plotting import (\n",
+    "    make_plot_policy_step_function,\n",
+    ")\n",
+    "from behavior_generation_lecture_python.mdp.mdp import (\n",
+    "    GridMDP,\n",
+    "    policy_gradient,\n",
+    "    derive_deterministic_policy,\n",
+    "    GRID_MDP_DICT,\n",
+    "    HIGHWAY_MDP_DICT,\n",
+    "    LC_RIGHT_ACTION,\n",
+    "    STAY_IN_LANE_ACTION,\n",
+    ")\n",
+    "\n",
+    "HIGHWAY_MDP_DICT[\"restrict_actions_to_available_states\"] = False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TOY EXAMPLE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grid_mdp = GridMDP(**GRID_MDP_DICT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "policy = CategorialPolicy(\n",
+    "    sizes=[len(grid_mdp.initial_state), 32, len(grid_mdp.actions)],\n",
+    "    actions=list(grid_mdp.actions),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_checkpoints = policy_gradient(\n",
+    "    mdp=grid_mdp,\n",
+    "    policy=policy,\n",
+    "    iterations=100,\n",
+    "    return_history=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "policy_array = [\n", + " derive_deterministic_policy(mdp=grid_mdp, policy=model)\n", + " for model in model_checkpoints\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_policy_step_grid_map = make_plot_policy_step_function(\n", + " columns=4, rows=3, policy_over_time=policy_array\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mkdocs_flag = True\n", + "if mkdocs_flag:\n", + " import ipywidgets\n", + " from IPython.display import display\n", + "\n", + " iteration_slider = ipywidgets.IntSlider(\n", + " min=0, max=len(model_checkpoints) - 1, step=1, value=0\n", + " )\n", + " w = ipywidgets.interactive(plot_policy_step_grid_map, iteration=iteration_slider)\n", + " display(w)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_policy_step_grid_map(100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HIGHWAY EXAMPLE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " # we will change this to true later on, to see the effect\n", + " HIGHWAY_MDP_DICT[\"transition_probabilities_per_action\"][LC_RIGHT_ACTION] = [\n", + " (0.4, LC_RIGHT_ACTION),\n", + " (0.6, STAY_IN_LANE_ACTION),\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "highway_mdp = GridMDP(**HIGHWAY_MDP_DICT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "policy = CategorialPolicy(\n", + " sizes=[len(highway_mdp.initial_state), 32, len(highway_mdp.actions)],\n", + " actions=list(highway_mdp.actions),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_checkpoints = policy_gradient(\n", + " mdp=highway_mdp,\n", + " policy=policy,\n", + " iterations=200,\n", + " return_history=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "policy_array = [\n", + " derive_deterministic_policy(mdp=highway_mdp, policy=model)\n", + " for model in model_checkpoints\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_policy_step_grid_map = make_plot_policy_step_function(\n", + " columns=10, rows=4, policy_over_time=policy_array\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if mkdocs_flag:\n", + " import ipywidgets\n", + " from IPython.display import display\n", + "\n", + " iteration_slider = ipywidgets.IntSlider(\n", + " min=0, max=len(model_checkpoints) - 1, step=1, value=0\n", + " )\n", + " w = ipywidgets.interactive(plot_policy_step_grid_map, iteration=iteration_slider)\n", + " display(w)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_policy_step_grid_map(200)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.11.5 ('kit_vorlesung_tutorial')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": 
".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + }, + "vscode": { + "interpreter": { + "hash": "c55583abd569aed2a1a6538892df4383b19c955ebf68dd4bc0814f5cb22bab0c" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 99f88cb..fe8169d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "behavior_generation_lecture_python" version = "0.0.2" description = "Python code for the respective lecture at KIT" readme = "README.md" -requires-python = ">=3.7" +requires-python = ">=3.8" license = {file = "LICENSE"} authors = [ {name = "Organizers of the lecture 'Verhaltensgenerierung für Fahrzeuge' at KIT" } @@ -17,7 +17,8 @@ dependencies = [ "matplotlib>=2.2.4", "scipy", "jupyter", - "python-statemachine" + "python-statemachine", + "torch" ] [project.optional-dependencies] diff --git a/src/behavior_generation_lecture_python/mdp/mdp.py b/src/behavior_generation_lecture_python/mdp/mdp.py index 5879a74..cc495c3 100644 --- a/src/behavior_generation_lecture_python/mdp/mdp.py +++ b/src/behavior_generation_lecture_python/mdp/mdp.py @@ -1,7 +1,14 @@ +"""This module contains the Markov Decision Process, value iteration, Q learning and policy gradient.""" + import math +from copy import deepcopy +from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Set, Tuple, Union import numpy as np +import torch + +from behavior_generation_lecture_python.mdp.policy import CategorialPolicy SIMPLE_MDP_DICT = { "states": [1, 2], @@ -54,6 +61,8 @@ class MDP: + """A Markov decision process.""" + def __init__( self, states: Set[Any], @@ -117,9 +126,7 @@ def get_actions(self, state) -> Set[Any]: """Get the set of actions available in a certain state, returns [None] for terminal states.""" if self.is_terminal(state): return {None} - return set( - [a for a in self.actions if (state, a) in self.transition_probabilities] - ) + return {a for a in self.actions if (state, a) in self.transition_probabilities} def get_reward(self, state) -> float: """Get the reward for a specific state.""" @@ -150,11 +157,20 @@ def sample_next_state(self, state, action) -> Any: ) return prob_per_transition[choice][1] + def execute_action(self, state, action) -> Tuple[Any, float, bool]: + """Executes the action in the current state and returns the new state, obtained reward and terminal flag.""" + new_state = self.sample_next_state(state=state, action=action) + reward = self.get_reward(state=new_state) + terminal = self.is_terminal(state=new_state) + return new_state, reward, terminal + GridState = Tuple[int, int] class GridMDP(MDP): + """A Markov decision process on a grid.""" + def __init__( self, grid: List[List[Union[float, None]]], @@ -406,7 +422,7 @@ def greedy_value_estimate_for_state(*, q_table: QTable, state: Any) -> float: available_actions = [ state_action[1] for state_action in q_table.keys() if state_action[0] == state ] - return max([q_table[(state, action)] for action in available_actions]) + return max(q_table[state, action] for action in available_actions) def q_learning( @@ -489,3 +505,168 @@ def q_learning( state: greedy_value_estimate_for_state(q_table=q_table, state=state) for state in mdp.get_states() } + + +@dataclass +class PolicyGradientBuffer: + """Buffer for the policy gradient method.""" + + states: List[Any] = field(default_factory=list) + actions: List[Any] = field(default_factory=list) + weights: List[float] = 
+    episode_returns: List[float] = field(default_factory=list)
+    episode_lengths: List[int] = field(default_factory=list)
+
+    def mean_episode_return(self) -> float:
+        """Mean episode return."""
+        return float(np.mean(self.episode_returns))
+
+    def mean_episode_length(self) -> float:
+        """Mean episode length."""
+        return float(np.mean(self.episode_lengths))
+
+
+def policy_gradient(
+    *,
+    mdp: MDP,
+    policy: CategorialPolicy,
+    lr: float = 1e-2,
+    iterations: int = 50,
+    batch_size: int = 5000,
+    return_history: bool = False,
+    use_random_init_state: bool = False,
+    verbose: bool = True,
+) -> Union[List[CategorialPolicy], CategorialPolicy]:
+    """Train a parameterized policy using the vanilla policy gradient.
+
+    Adapted from: https://github.com/openai/spinningup/blob/master/spinup/examples/pytorch/pg_math/1_simple_pg.py
+
+    The MIT License (MIT)
+
+    Copyright (c) 2018 OpenAI (http://openai.com)
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Args:
+        mdp: The underlying MDP.
+        policy: The stochastic policy to be trained.
+        lr: Learning rate.
+        iterations: Number of iterations.
+        batch_size: Number of samples generated for each policy update.
+        return_history: Whether to return the whole history of policies
+            instead of just the final one.
+        use_random_init_state: Whether to start each episode in a random non-terminal state.
+        verbose: Whether to print training progress.
+
+    Returns:
+        The final policy, if return_history is False. The history of
+        policies as a list, if return_history is True.
+ """ + np.random.seed(1337) + torch.manual_seed(1337) + + # add untrained model to model_checkpoints + model_checkpoints = [deepcopy(policy)] + + # make optimizer + optimizer = torch.optim.Adam(policy.net.parameters(), lr=lr) + + # get non-terminal states + non_terminal_states = [state for state in mdp.states if not mdp.is_terminal(state)] + + # training loop + for i in range(1, iterations + 1): + + # a buffer for storing intermediate values + buffer = PolicyGradientBuffer() + + # reset episode-specific variables + if use_random_init_state: + state = non_terminal_states[np.random.choice(len(non_terminal_states))] + else: + state = mdp.initial_state + episode_rewards = [] + + # collect experience by acting in the mdp + while True: + # save visited state + buffer.states.append(deepcopy(state)) + + # call model to get next action + action = policy.get_action(state=torch.tensor(state, dtype=torch.float32)) + + # execute action in the environment + state, reward, done = mdp.execute_action(state=state, action=action) + + # save action, reward + buffer.actions.append(action) + episode_rewards.append(reward) + + if done: + # if episode is over, record info about episode + episode_return = sum(episode_rewards) + episode_length = len(episode_rewards) + buffer.episode_returns.append(episode_return) + buffer.episode_lengths.append(episode_length) + # the weight for each logprob(a|s) is R(tau) + buffer.weights += [episode_return] * episode_length + + # reset episode-specific variables + if use_random_init_state: + state = non_terminal_states[ + np.random.choice(len(non_terminal_states)) + ] + else: + state = mdp.initial_state + episode_rewards = [] + + # end experience loop if we have enough of it + if len(buffer.states) > batch_size: + break + + # compute the loss + logp = policy.get_log_prob( + states=torch.tensor(buffer.states, dtype=torch.float), + actions=torch.tensor(buffer.actions, dtype=torch.long), + ) + batch_loss = -(logp * torch.tensor(buffer.weights, dtype=torch.float)).mean() + + # take a single policy gradient update step + optimizer.zero_grad() + batch_loss.backward() + optimizer.step() + + # logging + if verbose: + print( + f"iteration: {i:3d}; return: {buffer.mean_episode_return():.3f}; episode_length: {buffer.mean_episode_length():.3f}" + ) + if return_history: + model_checkpoints.append(deepcopy(policy)) + if return_history: + return model_checkpoints + return policy + + +def derive_deterministic_policy(mdp: MDP, policy: CategorialPolicy) -> Dict[Any, Any]: + """Compute the best policy for an MDP given the stochastic policy. + + Args: + mdp: The underlying MDP. + policy: The stochastic policy. + + Returns: + Deterministic policy, i.e. mapping from state to action. 
+ """ + pi = {} + for state in mdp.get_states(): + if mdp.is_terminal(state): + continue + pi[state] = policy.get_action( + state=torch.as_tensor(state, dtype=torch.float32), deterministic=True + ) + return pi diff --git a/src/behavior_generation_lecture_python/mdp/policy.py b/src/behavior_generation_lecture_python/mdp/policy.py new file mode 100644 index 0000000..28c7618 --- /dev/null +++ b/src/behavior_generation_lecture_python/mdp/policy.py @@ -0,0 +1,64 @@ +"""This module contains the CategoricalPolicy implementation.""" + +from typing import List, Type + +import torch +from torch import nn +from torch.distributions.categorical import Categorical + + +def multi_layer_perceptron( + sizes: List[int], + activation: Type[nn.Module] = nn.ReLU, + output_activation: Type[nn.Module] = nn.Identity, +): + """Returns a multi-layer perceptron""" + mlp = nn.Sequential() + for i in range(len(sizes) - 1): + mlp.append(nn.Linear(sizes[i], sizes[i + 1])) + if i < len(sizes) - 2: + mlp.append(activation()) + else: + mlp.append(output_activation()) + return mlp + + +class CategorialPolicy: + def __init__(self, sizes: List[int], actions: List): + assert sizes[-1] == len(actions) + torch.manual_seed(1337) + self.net = multi_layer_perceptron(sizes=sizes) + self.actions = actions + self._actions_tensor = torch.tensor(actions, dtype=torch.long).view( + len(actions), -1 + ) + + def _get_distribution(self, state: torch.Tensor): + """Calls the model and returns a categorial distribution over the actions.""" + logits = self.net(state) + return Categorical(logits=logits) + + def get_action(self, state: torch.Tensor, deterministic: bool = False): + """Returns an action sample for the given state""" + policy = self._get_distribution(state) + if deterministic: + return self.actions[policy.mode.item()] + return self.actions[policy.sample().item()] + + def get_log_prob(self, states: torch.Tensor, actions: torch.Tensor): + """Returns the log-probability for taking the action, when being the given state""" + return self._get_distribution(states).log_prob( + self._get_action_id_from_action(actions) + ) + + def _get_action_id_from_action(self, actions: torch.Tensor): + """Returns the indices of the passed actions in self.actions""" + reshaped_actions = actions.unsqueeze(1).expand( + -1, self._actions_tensor.size(0), -1 + ) + reshaped_actions_tensor = self._actions_tensor.unsqueeze(0).expand( + actions.size(0), -1, -1 + ) + return torch.where( + torch.all(reshaped_actions == reshaped_actions_tensor, dim=-1) + )[1] diff --git a/tests/test_mdp.py b/tests/test_mdp.py index 582026a..dde34d7 100644 --- a/tests/test_mdp.py +++ b/tests/test_mdp.py @@ -9,10 +9,12 @@ derive_policy, expected_utility_of_action, greedy_value_estimate_for_state, + policy_gradient, q_learning, random_action, value_iteration, ) +from behavior_generation_lecture_python.mdp.policy import CategorialPolicy def test_init_mdp(): @@ -151,3 +153,20 @@ def test_q_learning(return_history): iterations=10000, return_history=return_history, ) + + +@pytest.mark.parametrize("return_history", (True, False)) +def test_policy_gradient(return_history): + mdp = GridMDP(**GRID_MDP_DICT) + pol = CategorialPolicy( + sizes=[len(mdp.initial_state), 32, len(mdp.actions)], actions=list(mdp.actions) + ) + assert policy_gradient( + mdp=mdp, + policy=pol, + lr=1e2, + iterations=5, + batch_size=5000, + return_history=return_history, + verbose=False, + )