From 583384bbe750e02889a2c00acf3c9e1158b875dd Mon Sep 17 00:00:00 2001
From: Jon Leech <oddhack@sonic.net>
Date: Tue, 9 Apr 2024 07:18:08 -0700
Subject: [PATCH] First cut at script to cross-validate category ToC and actual
 generated refpages

Misc. minor fixes to specs and refinements of the script to follow.
---
 OpenCL_C.txt                 |   2 +-
 man/toctail                  |   2 -
 scripts/check_refpage_toc.py | 240 +++++++++++++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100755 scripts/check_refpage_toc.py
diff --git a/OpenCL_C.txt b/OpenCL_C.txt
index 5c27f104..99ac60fd 100644
--- a/OpenCL_C.txt
+++ b/OpenCL_C.txt
@@ -12369,7 +12369,7 @@ endif::cl_khr_mipmap_image_writes[]
 [[built-in-image-query-functions]]
 ==== Built-in Image Query Functions
 
-[open,refpage='imageQueryFunctions',desc='Built-in Image Query Functions',type='freeform',spec='clang',anchor='built-in-image-query-functions',xrefs='imageReadFunctions imageSamplerlessReadFunctions imageWriteFunctions',alias='get_image_width get_image_height get_image_depth get_image_channel_data_type get_image_channel_order get_image_dim get_image_array_size']
+[open,refpage='imageQueryFunctions',desc='Built-in Image Query Functions',type='freeform',spec='clang',anchor='built-in-image-query-functions',xrefs='imageReadFunctions imageSamplerlessReadFunctions imageWriteFunctions',alias='get_image_width get_image_height get_image_depth get_image_channel_data_type get_image_channel_order get_image_dim get_image_array_size get_image_num_samples']
 --
 
 The following built-in function calls to query image information are
diff --git a/man/toctail b/man/toctail
index 608ff791..ae2aa989 100644
--- a/man/toctail
+++ b/man/toctail
@@ -325,9 +325,7 @@
                                     <li><a href="get_image_channel_order.html" target="pagedisplay">get_image_channel_order</a></li>
                                     <li><a href="get_image_dim.html" target="pagedisplay">get_image_dim</a></li>
                                     <li><a href="get_image_array_size.html" target="pagedisplay">get_image_array_size</a></li>
-                                        <!-- Part of cl_khr_gl_msaa_sharing, not fully documented in static page
                                     <li><a href="get_image_num_samples.html" target="pagedisplay">get_image_num_samples</a></li>
-                                        -->
                                 </ul>
                             </li>
 
diff --git a/scripts/check_refpage_toc.py b/scripts/check_refpage_toc.py
new file mode 100755
index 00000000..21f18635
--- /dev/null
+++ b/scripts/check_refpage_toc.py
@@ -0,0 +1,240 @@
+#!/usr/bin/python3
+#
+# Copyright 2016-2024 The Khronos Group Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# check_refpage_toc.py - validate the hand-written refpage category index
+# against the actual generated refpages.
+#
+# Usage: check_refpage_toc.py -toctail path -refpages path
+
+import argparse
+import io
+import os
+import re
+import sys
+from reflib import loadFile
+
+#Inputs:
+#
+#man/toctail
+#    Contains the category index, which is a fragment (not valid HTML) with lines like
+#
+#        ...<a href="enums.html" target="pagedisplay">Enumerated Types</a>...
+#        ...<a href="clGetPlatformIDs.html" target="pagedisplay">clGetPlatformIDs</a>
+#
+#    This should ideally refer to all generated refpages and aliases of them.
+#
+#    With the .html removed, the link text will sometimes be the same as the
+#    page name; sometimes be an alias to the page name; and sometimes just be
+#    explanatory text.
+#
+#    We already know that some extension appendices do not appear in the
+#    category index; they do appear in the alphabetical index.
+#
+#    Unfortunately, some of the anchor text contains HTML constructs or text
+#    more complex than just a name or a sequence of words. Probably will need
+#    special handling.
+#
+#    There are the following duplicates in the link text;
+#
+#cl_image_format
+#cl_khr_d3d10_sharing
+#cl_khr_d3d11_sharing
+#cl_khr_dx9_media_sharing
+#cl_khr_gl_event
+#cl_khr_gl_sharing
+#clamp   (different targets, commonClamp / clamp_integer)
+#max     (different targets, commonMax / integerMax)
+#min     (different targets, commonMin / integerMin)
+#
+#    In general this is fine, so long as the targets are identical. The
+#    handful of different targets are also generally fine ATM and can
+#    be put on an exception list if needed.
+#
+#gen/refpage
+#    Contains the extracted and static (from man/static/) refpage source
+#    files, named e.g. 'clGetPlatformIDs.txt', and the Apache rewrites for
+#    aliases in 'rewritebody', which look like
+#
+#        RewriteRule ^CL_VERSION_1_0.html$ preprocessorDirectives.html
+#
+#    Combined, these make up the complete refpage set. We would like every
+#    page and alias to be the category index (even more ideally, they would
+#    be searchable in Antora, etc.)
+#
+#    rewritebody describes the actual aliases, and ideally should match the
+#    aliases in man/toctail.
+#
+#Comparison process:
+#    toctail - RE match to page names and descriptions.
+#    Construct dual index from page name to description(s) and vice-versa
+#    (handle aliased descriptions somehow).
+#
+#    refpages - scan directory for all .txt files and construct
+#    name -> name mapping.
+#    Scan rewritebody (or better, emit it as a Python dictionary as well) and
+#    construct alias -> name mapping.
+#    Note that .txt files are 'actual' refpages.
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-tocfile', action='store', dest='tocfile',
+                        default='man/toctail',
+                        help='Set the ToC file (default man/toctail')
+    parser.add_argument('-refpages', action='store', dest='refpages',
+                        default='generated/refpage',
+                        help='Set path to generate refpage directory (default generated/refpage)')
+
+    results = parser.parse_args()
+    errcount = 0
+
+    # Scan ToC
+    file, _ = loadFile(results.tocfile)
+    if file is None:
+        write(f'Cannot open ToC file {results.tocfile}')
+        sys.exit(1)
+
+    FIND_TOC_LINK_RE = re.compile(r'.*href="(?P<link>[^"]+)[^>]+>(?P<text>[^<]+)</a>')
+
+    # ToC links. Each entry is a set of link texts mapping to that refpage name
+    toclinks = {}
+
+    # ToC texts. Each entry is a set of refpage names described with that text
+    toctext = {}
+
+    for line in file:
+        # Extract ToC link, if present
+        match = FIND_TOC_LINK_RE.match(line)
+
+        if match is None:
+            continue
+
+        link = match.group('link')
+        text = match.group('text')
+
+        # Strip .html suffix
+        if link[-5:] != '.html':
+            print(f'{results.tocfile} contains non-.html link {link}: {line}')
+            continue
+        link = link[:-5]
+
+        #if text[-5:] != '.html':
+        #    print(f'{rewritefile} contains non-.html rewrite to {text}: {line}')
+        #    continue
+        #text = text[:-5]
+
+        if link not in toclinks:
+            toclinks[link] = set()
+        toclinks[link].add(text)
+
+        if text not in toctext:
+            toctext[text] = set()
+        toctext[text].add(link)
+
+    print(f'{len(toclinks)} links in ToC, {len(toctext)} link-texts')
+    for link, texts in toclinks.items():
+        if len(texts) > 1:
+            print(f'link {link} -> {len(texts)} distinct texts: {texts}')
+
+    for text, links in toctext.items():
+        if len(links) > 1:
+            print(f'text {text} -> {len(links)} distinct links: {links}')
+
+    # Scan refpages
+
+    # Actual pages - each entry is a set of aliases that rewrite to that page
+    refpages = {}
+
+    # Page aliases - each entry is the page name they are rewritten to
+    refaliases = {}
+
+    # Collect files in the refpage directory
+    for file in os.listdir(results.refpages):
+        if file.endswith('.txt'):
+            # Strip '.txt' leaving the API name
+            pagename = file[:-4]
+            refpages[pagename] = set()
+
+    # Collect aliases from the rewritebody file
+    rewritefile = results.refpages + '/rewritebody'
+    file, _ = loadFile(rewritefile)
+    if file is None:
+        write(f'Cannot open HTML rewrite file {rewritefile}')
+        sys.exit(1)
+
+    FIND_REWRITE_LINK_RE = re.compile(r'.*\^(?P<alias>[^$]+)\$ +(?P<page>[^ ]+)')
+
+    for line in file:
+        # Extract rewrite directive, if present
+        # Remove trailing newline
+        match = FIND_REWRITE_LINK_RE.match(line.rstrip())
+
+        if match is None:
+            continue
+
+        alias = match.group('alias')
+        page = match.group('page')
+
+        if alias[-5:] != '.html':
+            print(f'{rewritefile} contains non-.html rewrite from {alias}: {line}')
+            continue
+
+        if page[-5:] != '.html':
+            print(f'{rewritefile} contains non-.html rewrite to {page}: {line}')
+            continue
+
+        # Now strip .html suffixes
+        alias = alias[:-5]
+        page = page[:-5]
+
+        # Track alias -> page rewrites
+        if alias in refaliases:
+            print(f'{rewritefile} contains multiple rewrites of {alias}: {line}')
+            errcount += 1
+        else:
+            refaliases[alias] = page
+
+        # Track all aliases rewriting to a page
+        if page not in refpages:
+            print(f'{rewritefile} contains rewrite of {alias} to the nonexistent refpage {page}: {line}')
+            errcount += 1
+        else:
+            refpages[page].add(alias)
+
+    #####################################
+
+    # Verify that all .html files in toctail exist and tag whether they are
+    # aliases or not.
+
+    for link in sorted(toclinks.keys()):
+        if link not in refpages and link not in refaliases:
+            print(f'ToC link to {link}.html has no corresponding refpage {link}.txt, or alias to it')
+            errcount += 1
+
+    # Verify that all aliases in refpages/ appear in toctail as descriptions.
+
+    for alias in sorted(refaliases.keys()):
+        if alias not in toctext:
+            print(f'refpage alias {alias} has no link from the ToC, at least with that name')
+            errcount += 1
+
+    # Verify that all non-aliased names (.txt files) in refpages appear in
+    # toctail as .html files and descriptions.
+
+    for page in sorted(refpages.keys()):
+        if page not in toclinks:
+            print(f'refpage {page} has no link from the ToC')
+            errcount += 1
+
+    print(f'{errcount} errors found')
+
+    # Summarize link text (descriptions) that do not appear as refpage
+    # aliases. Most are probably just straight text and fine.
+    #
+    # Summarize any remaining discrepancies.
+
+    print(f'Found {len(refpages)} reference pages')
+    print(f'Found {len(refaliases)} aliases to reference pages')