diff --git a/CHANGELOG.md b/CHANGELOG.md
index 470aea3..426cef8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,20 @@ Version numbers: major.minor.patch
* Minor version bump indicates a change in functionality that may affect users.
* Patch version bump indicates bug-fixes or minor improvements not expected to affect users.
+## v5.0.0
+* Based on PyTorch version 1.2
+* Improved training stability: gradient capping and warm-up
+* Merged mod-base and canonical entry points
+ * Custom model definitions should now take an
+   `alphabet_info` argument rather than `outsize` (see the sketch below)
+* Improved RNA support: tools can reverse references and basecalls
+* Basecaller changes:
+ * Chunk size argument now matches Guppy
+ * CPU calling enabled
+ * Lower memory usage
+* Multi-GPU training enabled
+* Bug fixes
+
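For users maintaining custom model files, the entry-point merge above means a model-definition function now receives an `alphabet_info` object instead of the old integer `outsize`. A hypothetical sketch of the new signature (the layer names and the `nbase` attribute are assumptions in the style of `taiyaki.layers`, not taken from this diff):

    from taiyaki.layers import (Convolution, GlobalNormFlipFlop, GruMod,
                                Reverse, Serial)

    def network(insize=1, size=256, winlen=19, stride=5, alphabet_info=None):
        # v5.0.0 signature: alphabet_info replaces the old integer outsize;
        # nbase is assumed to give the number of bases to model.
        nbase = 4 if alphabet_info is None else alphabet_info.nbase
        return Serial([
            Convolution(insize, size, winlen, stride=stride),
            Reverse(GruMod(size, size)),
            GruMod(size, size),
            GlobalNormFlipFlop(size, nbase),
        ])
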
## v4.1.0
* Ab initio ("bootstrap") training of models
@@ -16,7 +30,6 @@ Version numbers: major.minor.patch
* Training walk-through
* Tweaks to optimisation parameters
-
## v3.0.2
* Improved training parameters
* Use orthonormal initialisation of starting weights
diff --git a/Makefile b/Makefile
index 5cabcbf..e41ac56 100644
--- a/Makefile
+++ b/Makefile
@@ -11,19 +11,17 @@ CUDA ?= $(shell (which nvcc && nvcc --version) | grep -oP "(?<=release )[0-9.]+"
# Determine correct torch package to install
-TORCH_CUDA_8.0 = cu80
-TORCH_CUDA_9.0 = cu90
+TORCH_CUDA_9.2 = cu92
TORCH_CUDA_10.0 = cu100
TORCH_PLATFORM ?= $(if $(TORCH_CUDA_$(CUDA)),$(TORCH_CUDA_$(CUDA)),cpu)
PY3_MINOR = $(shell $(PYTHON) -c "import sys; print(sys.version_info.minor)")
-TORCH_Linux = http://download.pytorch.org/whl/${TORCH_PLATFORM}/torch-1.0.0-cp3${PY3_MINOR}-cp3${PY3_MINOR}m-linux_x86_64.whl
+TORCH_Linux = http://download.pytorch.org/whl/${TORCH_PLATFORM}/torch-1.2.0-cp3${PY3_MINOR}-cp3${PY3_MINOR}m-manylinux1_x86_64.whl
TORCH_Darwin = torch
TORCH ?= $(TORCH_$(shell uname -s))
# determine correct cupy package to install
-CUPY_8.0 = cupy-cuda80
-CUPY_9.0 = cupy-cuda90
+CUPY_9.2 = cupy-cuda92
CUPY_10.0 = cupy-cuda100
CUPY ?= $(CUPY_$(CUDA))
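
The CUDA-to-wheel mapping above drops the cu80/cu90 builds (PyTorch 1.2 ships Linux wheels for CUDA 9.2 and 10.0 only, plus CPU) and moves to the manylinux1 wheel tag. For readers who want to check which wheel the Makefile will pick, here is a rough Python equivalent of the same detection logic; it is a sketch that simply mirrors the TORCH_Linux URL pattern above, not part of the build:

    import re
    import subprocess
    import sys

    def torch_platform():
        # Mirror the Makefile: map the nvcc-reported CUDA release to a
        # torch wheel tag, falling back to the CPU build.
        try:
            out = subprocess.run(["nvcc", "--version"], capture_output=True,
                                 text=True, check=True).stdout
            cuda = re.search(r"release ([0-9.]+)", out).group(1)
        except (OSError, subprocess.CalledProcessError, AttributeError):
            return "cpu"
        return {"9.2": "cu92", "10.0": "cu100"}.get(cuda, "cpu")

    minor = sys.version_info.minor
    print("http://download.pytorch.org/whl/{}/torch-1.2.0-cp3{m}-cp3{m}m-"
          "manylinux1_x86_64.whl".format(torch_platform(), m=minor))
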
diff --git a/README.md b/README.md
index e7d286e..a7d3062 100644
--- a/README.md
+++ b/README.md
@@ -31,9 +31,9 @@ expect to get your hands dirty.
# Contents
1. [Installing system prerequisites](#installing-system-prerequisites)
-2. [Installing Taiyaki](#installation)
+2. [Installing Taiyaki](#installing-taiyaki)
3. [Tests](#tests)
-4. [Walk through](#walk-through)
+4. [Walk through](#walk-throughs-and-further-documentation)
5. [Workflows](#workflows)
* [Using the workflow Makefile](#using-the-workflow-makefile)
* [Steps from fast5 files to basecalling](#steps-from-fast5-files-to-basecalling)
@@ -86,7 +86,7 @@ Windows is not supported.
If you intend to use Taiyaki with a GPU, make sure you have installed and set up [CUDA](#cuda) before proceeding.
---
-## Install Taiyaki in a new virtual environment
+## Install Taiyaki in a new virtual environment (RECOMMENDED)
We recommend installing Taiyaki in a self-contained [virtual environment](https://docs.python.org/3/tutorial/venv.html).
@@ -99,6 +99,9 @@ You will need to run `source venv/bin/activate` at the start of each session whe
## Install Taiyaki system-wide or into activated Python environment
+This is not the recommended installation method: we recommend that you install Taiyaki in its
+[own virtual environment](#install-taiyaki-in-a-new-virtual-environment-recommended) if possible.
+
Taiyaki can be installed from source using either:
python3 setup.py install
@@ -111,14 +114,13 @@ Alternatively, you can use pip with either:
# Tests
-Tests can be run as follows:
-
- make workflow #runs scripts which carry out the workflow for basecall-network training and for squiggle-predictor training
- make acctest #runs acceptance tests
- make unittest #runs unit tests
- make multiGPU_test #runs multi-GPU test (GPUs 0 and 1 must be available, and CUDA must be installed - see below)
+Tests can be run as follows, provided that Taiyaki was installed using the recommended `make install` method:
-If Taiyaki has been installed in a virtual environment, it will have to activated before running tests: `source venv/bin/activate`. To deactivate, run `deactivate`.
+ source venv/bin/activate # activates taiyaki virtual environment (do this first)
+ make workflow # runs scripts which carry out the workflow for basecall-network training and for squiggle-predictor training
+ make acctest # runs acceptance tests
+ make unittest # runs unit tests
+ make multiGPU_test # runs multi-GPU test (GPUs 0 and 1 must be available, and CUDA must be installed - see below)
# Walk throughs and further documentation
For a walk-through of Taiyaki model training, including how to obtain sample training data, see [docs/walkthrough.rst](docs/walkthrough.rst).
diff --git a/bin/basecall.py b/bin/basecall.py
index 6dd8cfd..610d96c 100755
--- a/bin/basecall.py
+++ b/bin/basecall.py
@@ -30,21 +30,21 @@
add_common_command_args(parser, 'alphabet device input_folder input_strand_list limit output quiet recursive version'.split())
-parser.add_argument("--chunk_size", type=Positive(int),
+parser.add_argument("--chunk_size", type=Positive(int), metavar="blocks",
default=basecall_helpers._DEFAULT_CHUNK_SIZE,
- help="Size of signal chunks sent to GPU")
+ help="Size of signal chunks sent to GPU, in blocks: samples per chunk = chunk_size * model stride")
parser.add_argument("--max_concurrent_chunks", type=Positive(int),
default=128, help="Maximum number of chunks to call at "
"once. Lower values will consume less (GPU) RAM.")
-parser.add_argument("--modified_base_output", action=FileAbsent, default=None,
+parser.add_argument("--modified_base_output", action=FileAbsent, default=None, metavar="mod_basecalls.hdf5",
help="Output filename for modified base output.")
-parser.add_argument("--overlap", type=NonNegative(int),
+parser.add_argument("--overlap", type=NonNegative(int), metavar="blocks",
default=basecall_helpers._DEFAULT_OVERLAP,
help="Overlap between signal chunks sent to GPU")
parser.add_argument('--reverse', default=False, action=AutoBool,
help='Reverse sequences in output')
parser.add_argument('--scaling', action=FileExists, default=None,
- help='Per-read scaling params')
+ help='Path to TSV containing per-read scaling params')
parser.add_argument("model", action=FileExists,
help="Model checkpoint file to use for basecalling")
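
The new `metavar="blocks"` and expanded help make the units explicit: `--chunk_size` and `--overlap` are measured in blocks of model-stride samples rather than raw samples. A minimal illustration of the implied conversion (the numbers below are assumptions for illustration; the real stride comes from the loaded model):

    # Hypothetical numbers illustrating the blocks -> samples conversion
    # described by the new --chunk_size help text.
    stride = 5             # example model stride (assumption)
    chunk_size = 1000      # --chunk_size, in blocks
    overlap = 100          # --overlap, in blocks

    samples_per_chunk = chunk_size * stride  # signal samples sent to the GPU
    samples_overlap = overlap * stride       # samples shared between chunks
    print(samples_per_chunk, samples_overlap)  # 5000 500
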
diff --git a/bin/train_abinitio.py b/bin/train_abinitio.py
index 9b4f143..b5447cf 100755
--- a/bin/train_abinitio.py
+++ b/bin/train_abinitio.py
@@ -138,7 +138,6 @@ def save_model(network, outdir, index=None):
for i in range(args.niteration):
- lr_scheduler.step()
idx = np.random.randint(len(chunks), size=args.batch_size)
indata = chunks[idx].transpose(1, 0)
@@ -186,4 +185,7 @@ def save_model(network, outdir, index=None):
total_samples = 0
t0 = tn
+ lr_scheduler.step()
+
+
save_model(network, args.outdir)
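
This hunk moves `lr_scheduler.step()` from the top of the training loop to after the weight update, and the matching hunks in train_flipflop.py and train_squiggle.py below do the same. Since PyTorch 1.1, schedulers are expected to be stepped after `optimizer.step()`; stepping them first skips the initial learning-rate value and triggers a warning. A minimal self-contained sketch of the corrected ordering:

    import torch

    model = torch.nn.Linear(8, 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

    for _ in range(10):
        optimizer.zero_grad()
        loss = model(torch.randn(4, 8)).pow(2).mean()
        loss.backward()
        optimizer.step()
        scheduler.step()  # after optimizer.step(), per PyTorch >= 1.1
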
diff --git a/bin/train_flipflop.py b/bin/train_flipflop.py
index be4a17c..1d4f6bd 100755
--- a/bin/train_flipflop.py
+++ b/bin/train_flipflop.py
@@ -419,8 +419,6 @@ def main():
for i in range(args.niteration):
- lr_scheduler.step()
-
# Chunk length is chosen randomly in the range given but forced to
# be a multiple of the stride
batch_chunk_len = (np.random.randint(
@@ -520,6 +518,9 @@ def main():
# log.write("* GPU{} params:".format(args.local_rank))
#log.write("{}...{}\n".format(v,u))
+ lr_scheduler.step()
+
+
if is_lead_process:
helpers.save_model(network, args.outdir,
model_skeleton=network_save_skeleton)
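
A side note on the context above: the batch chunk length is drawn at random but forced to a multiple of the model stride, so each chunk maps to a whole number of output blocks. The exact expression is truncated out of this hunk; one common way to do it, sketched with assumed bounds:

    import numpy as np

    stride = 5
    min_len, max_len = 2000, 4000  # example chunk-length bounds (assumption)

    # Draw in units of whole strides, then scale back up, so the result
    # is always an exact multiple of the model stride.
    batch_chunk_len = np.random.randint(
        min_len // stride, max_len // stride + 1) * stride
    assert batch_chunk_len % stride == 0
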
diff --git a/bin/train_squiggle.py b/bin/train_squiggle.py
index f2235b7..450c1d7 100755
--- a/bin/train_squiggle.py
+++ b/bin/train_squiggle.py
@@ -129,7 +129,6 @@ def main():
total_chunks = 0
for i in range(args.niteration):
- lr_scheduler.step()
# If the logging threshold is 0 then we log all chunks, including those rejected, so pass the log
# object into assemble_batch
# chunk_batch is a list of dicts.
@@ -194,6 +193,7 @@ def main():
log.write(" {:.1%} chunks filtered".format(n_fail / n_tot))
log.write("\n")
+ lr_scheduler.step()
helpers.save_model(conv_net, args.outdir)
diff --git a/requirements.txt b/requirements.txt
index d342149..62e54e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,4 @@ ont_fast5_api == 1.2.0
pysam >= 0.15.0
matplotlib >= 2.0.0
scipy >= 1
-torch >= 1, < 1.1
+torch == 1.2
diff --git a/setup.py b/setup.py
index 4b8a0cc..3b201ad 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
"matplotlib >= 2.0.0",
"pysam >= 0.15.0",
"scipy >= 1",
- "torch >= 1, < 1.1",
+ "torch == 1.2"
]
diff --git a/taiyaki/__init__.py b/taiyaki/__init__.py
index 8eb964f..7ccd5b9 100644
--- a/taiyaki/__init__.py
+++ b/taiyaki/__init__.py
@@ -1,7 +1,7 @@
"""Custard owns my heart!"""
__version_info__ = {
- 'major': 4,
- 'minor': 1,
+ 'major': 5,
+ 'minor': 0,
'revision': 0,
}
__version__ = "{major}.{minor}.{revision}".format(**__version_info__)
diff --git a/taiyaki/layers.py b/taiyaki/layers.py
index 14855e6..9c63c0e 100755
--- a/taiyaki/layers.py
+++ b/taiyaki/layers.py
@@ -19,7 +19,8 @@
def init_(param, value):
"""Set parameter value (inplace) from tensor, numpy array, list or tuple"""
value_as_tensor = torch.tensor(value, dtype=param.data.dtype)
- param.data.detach_().set_(value_as_tensor)
+ with torch.no_grad():
+ param.set_(value_as_tensor)
def random_orthonormal(n, m=None):
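
The rewritten `init_` replaces the `.data`/`detach_` idiom with `torch.no_grad()`, the supported way in recent PyTorch to mutate a parameter in place without recording the copy on the autograd tape. A self-contained sketch of the same pattern:

    import torch

    param = torch.nn.Parameter(torch.zeros(3))
    with torch.no_grad():
        # In-place overwrite; no_grad stops autograd tracking the copy.
        param.set_(torch.tensor([1.0, 2.0, 3.0], dtype=param.dtype))
    print(param)  # tensor([1., 2., 3.], requires_grad=True)
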
@@ -595,24 +596,8 @@ def birnn(forward, backward):
@torch.jit.script
-def logaddexp_fwdbwd(x, y):
- z = torch.max(x, y) + torch.log1p(torch.exp(-torch.abs(x - y)))
- return z, (x-z).exp(), (y-z).exp()
-
-
-class LogAddExp(torch.autograd.Function):
- @staticmethod
- def forward(ctx, x, y):
- z, xmz, ymz = logaddexp_fwdbwd(x, y)
- ctx.save_for_backward(xmz, ymz)
- return z
-
- @staticmethod
- def backward(ctx, outgrad):
- xmz, ymz = ctx.saved_tensors
- return outgrad * xmz, outgrad * ymz
-
-logaddexp = LogAddExp.apply
+def logaddexp(x, y):
+ return torch.max(x, y) + torch.log1p(torch.exp(-torch.abs(x - y)))
@torch.jit.script
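
Dropping the hand-written `LogAddExp` autograd Function is safe because autograd can differentiate the scripted expression directly, and its derivatives match what the old backward returned, namely `(x-z).exp()` and `(y-z).exp()`. A quick equivalence check, assuming PyTorch >= 1.2 (loose atol to allow float32 rounding):

    import torch

    @torch.jit.script
    def logaddexp(x, y):
        return torch.max(x, y) + torch.log1p(torch.exp(-torch.abs(x - y)))

    x = torch.randn(5, requires_grad=True)
    y = torch.randn(5, requires_grad=True)
    z = logaddexp(x, y)
    z.sum().backward()

    # Gradients of log(exp(x) + exp(y)) are the softmax weights.
    assert torch.allclose(x.grad, (x - z).exp().detach(), atol=1e-6)
    assert torch.allclose(y.grad, (y - z).exp().detach(), atol=1e-6)
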
diff --git a/test/unit/test_layers.py b/test/unit/test_layers.py
index c92254e..954a56a 100644
--- a/test/unit/test_layers.py
+++ b/test/unit/test_layers.py
@@ -326,7 +326,7 @@ def test_cupy_and_non_cupy_same(self):
# rtol before softmax = atol after softmax. Therefore I've replaced
# the atol with the default value for rtol.
print((abs(x1.grad - x2.grad)).max())
- self.assertTrue(torch.allclose(x1.grad, x2.grad, atol=1e-05))
+ self.assertTrue(torch.allclose(x1.grad, x2.grad, atol=1e-04))
class UpSampleTest(LayerTest, unittest.TestCase):
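
For context on the tolerance change: `torch.allclose(a, b, rtol, atol)` passes when `|a - b| <= atol + rtol * |b|` holds elementwise, so raising `atol` from 1e-05 to 1e-04 gives the cupy and non-cupy gradient paths more headroom for float32 accumulation differences. A toy illustration:

    import torch

    a = torch.tensor([1.00000, 0.50005])
    b = torch.tensor([1.00008, 0.50000])

    # Passes elementwise when |a - b| <= atol + rtol * |b| (rtol defaults to 1e-05).
    print(torch.allclose(a, b, atol=1e-05))  # False: diffs of ~8e-05 exceed it
    print(torch.allclose(a, b, atol=1e-04))  # True: both diffs now within bound
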