diff --git a/test/README.md b/test/README.md
index a036e7c..a561004 100644
--- a/test/README.md
+++ b/test/README.md
@@ -1,11 +1,13 @@
 # Test
 
-This directory contains test cases that are mostly meant for internal testing. However, feel free to have a look if you are interested.
+This directory contains test cases that are mostly meant for internal testing. However, feel free to have a look if you are interested. For instance, the generated test data can be useful as a standardized test for evaluating other tools.
 
-We here run grenedalf as well as our minimalistic independent implementation of the equations in Python, and compare the two to each other. As they both yield the same results, we do have some confidence that the equations are correctly implemented. The tests span a range of pool sizes, read depths, allele frequencies, and window sizes. However, this is a minimalistic test with clean data, i.e., we assume sufficient read depth and no missing data in the test. Hence, if you encounter any issues on more realistic data, please report the [issue](https://github.com/lczech/grenedalf/issues).
+We here run grenedalf as well as our minimalistic independent implementation of the equations in Python, and compare the two to each other. As they both yield the same results, we gain some confidence that the equations are correctly implemented. The tests span a range of pool sizes, read depths, allele frequencies, and window sizes. However, this is a minimalistic test with clean data, i.e., we assume sufficient read depth and no missing data in the test. Hence, if you encounter any issues on more realistic data, please report the [issue](https://github.com/lczech/grenedalf/issues).
 
 The Python dependencies for running the tests are listed in `conda.yaml` and `pip.txt`, for the two package manages. Use these to install the packages needed to run the tests. We also run these tests in our [CI Tests](https://github.com/lczech/grenedalf/actions) in GitHub Actions.
 
+To run the tests locally, make sure all Python dependencies are installed. Then, the `execute_tests.py` script runs all 960 test cases, and the `evaluate.py` script checks that both implementations (grenedalf and the independent minimalistic Python implementation) yield the same results, and plots them against each other, for all estimators that we are interested in.
+
 By default, we do not test PoPoolation here, as that's not the scope of this test. We however have implemented this for completeness as well; if you want to run the test for PoPoolation, find the commented line containing `run_popoolation` at the very end of the `execute_test.py` script, and un-comment it. Furthermore, download the source code of [PoPoolation](https://sourceforge.net/projects/popoolation/) and [PoPoolation2](https://sourceforge.net/projects/popoolation2/), and place both as sub-directories in `test/popoolation`, named `popoolation` and `popoolation2`, respectively.
 
 The test scripts are also published in the supporting grenedalf manuscript repository, see [here](https://github.com/lczech/grenedalf-paper/tree/master/eval-independent-test). That repository contains the scripts for all tests and benchmarks that we ran for the manuscript, in particular for assessing correctness and performance of grenedalf.
diff --git a/test/clean.sh b/test/clean.sh
index f683327..3ff32dd 100755
--- a/test/clean.sh
+++ b/test/clean.sh
@@ -14,7 +14,7 @@ rm -r popoolation/fst
 rm -r popoolation/logs
 
 # Remove the test result files
-rm -r test_results.tsv
+rm -r test_results.csv
 rm -r figures_*
 
 # Also remove unnecessary py stuff
diff --git a/test/evaluate.py b/test/evaluate.py
index 0c8d33f..deeb286 100755
--- a/test/evaluate.py
+++ b/test/evaluate.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
 
-# libraries
+# Matplotlib shenanigans: select the non-interactive Agg backend before importing pyplot.
+# https://stackoverflow.com/a/71511579/4184258
 import matplotlib
-matplotlib.use('TkAgg')
+matplotlib.use('Agg')
+# matplotlib.use('TkAgg')
 
+# libraries
 import matplotlib.pyplot as plt
 import matplotlib.mlab as mlab
 import numpy as np
@@ -22,7 +25,7 @@
 out_dir_pdf = "figures_pdf"
 out_dir_svg = "figures_svg"
 
-infile = "test_results.tsv"
+infile = "test_results.csv"
 df = pd.read_csv(infile, sep='\t')
 
 # ------------------------------------------------------------
diff --git a/test/execute_tests.py b/test/execute_tests.py
index 8e3fefa..f43d692 100755
--- a/test/execute_tests.py
+++ b/test/execute_tests.py
@@ -21,7 +21,7 @@
 # ------------------------------------------------------------
 
 # File to write to
-outtable = "test_results.tsv"
+outtable = "test_results.csv"
 
 # We here set up the param space, following the notation of independent_check.py
 
diff --git a/test/run.sh b/test/run.sh
index 412437a..0ea45da 100755
--- a/test/run.sh
+++ b/test/run.sh
@@ -3,14 +3,14 @@
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 
 echo "Running execute_tests.py"
-python3 ${SCRIPT_DIR}/execute_tests.py
+${SCRIPT_DIR}/execute_tests.py
 if [ $? -ne 0 ]; then
     echo "FAIL"
     exit 1
 fi
 
 echo "Running evaluate.py"
-python3 ${SCRIPT_DIR}/evaluate.py
+${SCRIPT_DIR}/evaluate.py
 if [ $? -ne 0 ]; then
     echo "FAIL"
     exit 1