From 94c671af4d43dd7e6544d4ab4b09caf3837087ec Mon Sep 17 00:00:00 2001
From: Mark Newman <mark@trinetteandmark.com>
Date: Mon, 4 Jan 2021 17:11:36 -0500
Subject: [PATCH] basic QA checks

---
 README.md                |  8 +++-
 code/qa_speeches_text.py | 79 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 code/qa_speeches_text.py

diff --git a/README.md b/README.md
index e5a891b..5b9c922 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,13 @@ The pathing can be changed to any desired location.
    ```{ps1}
    python convert_speeches_text.py -in d:/datasets/who/raw -out d:/datasets/who/corpus.jsonl
    ```
-6. [tokenize_speeches_text](./code/tokenize_speeches_text.py).
+6. [qa_speeches_text](./code/qa_speeches_text.py).
+   This script runs a simple QA check on the data.
+   The script can be run more than once.
+   ```{ps1}
+   python qa_speeches_text.py -raw d:/datasets/who/raw -jsonl d:/datasets/who/corpus.jsonl -out d:/datasets/who/qa.csv
+   ```
+7. [tokenize_speeches_text](./code/tokenize_speeches_text.py).
    This script will tokenize the raw speech text, converting one paragraph per line to one sentence per line.
    Additional cleanup (i.e. ` to ') will also be performed.
    The script can be run more than once.
diff --git a/code/qa_speeches_text.py b/code/qa_speeches_text.py
new file mode 100644
index 0000000..5efa251
--- /dev/null
+++ b/code/qa_speeches_text.py
@@ -0,0 +1,79 @@
+import csv
+import pathlib
+import jsonlines as jl
+import progressbar as pb
+import typing as t
+from argparse import ArgumentParser
+from typeguard import typechecked
+
+@typechecked
+def qa_speeches_text(raw_folder: pathlib.Path, jsonl_file: pathlib.Path, file_out: pathlib.Path) -> None:
+    """
+    Runs a simple QA check on the converted speeches, writing one CSV row per speech
+
+    Parameters
+    ----------
+    raw_folder : pathlib.Path
+        Folder containing the raw documents; must already exist
+    jsonl_file : pathlib.Path
+        File containing the speeches text, read with jsonlines; must already exist
+    file_out : pathlib.Path
+        CSV file receiving the QA results; replaced if it already exists
+    """
+
+    if not raw_folder.exists():
+        raise RuntimeError(f'Could not find the HTML folder: {raw_folder}')
+    if not jsonl_file.exists():
+        raise RuntimeError(f'Could not find the JSONL file: {jsonl_file}')
+    if file_out.exists():
+        file_out.unlink()
+
+    header = ['id', 'raw_count', 'converted_count']
+    widgets = [ 'QA Speech # ', pb.Counter(), ' ', pb.BouncingBar(marker = '.', left = '[', right = ']'), ' ', pb.Timer()]
+    with pb.ProgressBar(widgets = widgets) as progress, \
+         open(jsonl_file, 'r', encoding = 'utf-8') as source, \
+         jl.Reader(source) as records, \
+         open(file_out, 'w', encoding = 'utf-8', newline = '') as target:
+        rows = csv.writer(target, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_ALL)
+        rows.writerow(header)
+        # The progress counter is 1-based and is advanced before each record is measured
+        for position, record in enumerate(records, start = 1):
+            progress.update(position)
+            raw_count, converted_count = _process_document(raw_folder, record)
+            rows.writerow([record['id'], raw_count, converted_count])
+
+@typechecked
+def _process_document(raw_folder: pathlib.Path, json: dict) -> t.Tuple[int, int]:
+    """
+    Calculates basic measures to help understand if the process worked as expected.
+    Reads the raw file named by json['id'] inside raw_folder and returns (raw size, converted size):
+    the summed character lengths of the raw file's lines and of the items in json['text'].
+    """
+    file_in = raw_folder.joinpath(json['id'])
+    with open(file_in, 'r', encoding = 'utf-8') as fp:
+        # Stream the file line by line rather than materializing every line with readlines()
+        full_size = sum(len(line) for line in fp)
+    return full_size, sum(len(line) for line in json['text'])
+
+if __name__ == '__main__':
+    # Command-line entry point: all three paths are required and parsed as pathlib.Path
+    parser = ArgumentParser()
+    parser.add_argument(
+        '-raw', '--raw-folder',
+        help = 'Folder to contain the downloaded documents',
+        type = pathlib.Path, required = True)
+    parser.add_argument(
+        '-jsonl', '--jsonl-file',
+        # Double quotes keep the apostrophe; '...''...' concatenates two literals and drops it
+        help = "File containing the speeches' text",
+        type = pathlib.Path, required = True)
+    parser.add_argument(
+        '-out', '--file-out',
+        help = 'File containing the QA results',
+        type = pathlib.Path, required = True)
+    args = parser.parse_args()
+    # Echo the parsed arguments before the QA run starts
+    print(f'raw folder in: {args.raw_folder}')
+    print(f'jsonl file in: {args.jsonl_file}')
+    print(f'qa file out: {args.file_out}')
+    qa_speeches_text(args.raw_folder, args.jsonl_file, args.file_out)