Skip to content

Commit

Permalink
First cut at script to cross-validate category ToC and actual generat…
Browse files Browse the repository at this point in the history
…ed refpages

Misc. minor fixes to specs and refinements of the script to follow.
  • Loading branch information
oddhack committed Apr 9, 2024
1 parent 26712b2 commit 583384b
Show file tree
Hide file tree
Showing 3 changed files with 241 additions and 3 deletions.
2 changes: 1 addition & 1 deletion OpenCL_C.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12369,7 +12369,7 @@ endif::cl_khr_mipmap_image_writes[]
[[built-in-image-query-functions]]
==== Built-in Image Query Functions

[open,refpage='imageQueryFunctions',desc='Built-in Image Query Functions',type='freeform',spec='clang',anchor='built-in-image-query-functions',xrefs='imageReadFunctions imageSamplerlessReadFunctions imageWriteFunctions',alias='get_image_width get_image_height get_image_depth get_image_channel_data_type get_image_channel_order get_image_dim get_image_array_size']
[open,refpage='imageQueryFunctions',desc='Built-in Image Query Functions',type='freeform',spec='clang',anchor='built-in-image-query-functions',xrefs='imageReadFunctions imageSamplerlessReadFunctions imageWriteFunctions',alias='get_image_width get_image_height get_image_depth get_image_channel_data_type get_image_channel_order get_image_dim get_image_array_size get_image_num_samples']
--

The following built-in function calls to query image information are
Expand Down
2 changes: 0 additions & 2 deletions man/toctail
Original file line number Diff line number Diff line change
Expand Up @@ -325,9 +325,7 @@
<li><a href="get_image_channel_order.html" target="pagedisplay">get_image_channel_order</a></li>
<li><a href="get_image_dim.html" target="pagedisplay">get_image_dim</a></li>
<li><a href="get_image_array_size.html" target="pagedisplay">get_image_array_size</a></li>
<!-- Part of cl_khr_gl_msaa_sharing, not fully documented in static page
<li><a href="get_image_num_samples.html" target="pagedisplay">get_image_num_samples</a></li>
-->
</ul>
</li>

Expand Down
240 changes: 240 additions & 0 deletions scripts/check_refpage_toc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
#!/usr/bin/python3
#
# Copyright 2016-2024 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

# check_refpage_toc.py - validate the hand-written refpage category index
# against the actual generated refpages.
#
# Usage: check_refpage_toc.py -tocfile path -refpages path

import argparse
import io
import os
import re
import sys
from reflib import loadFile

#Inputs:
#
#man/toctail
# Contains the category index, which is a fragment (not valid HTML) with lines like
#
# ...<a href="enums.html" target="pagedisplay">Enumerated Types</a>...
# ...<a href="clGetPlatformIDs.html" target="pagedisplay">clGetPlatformIDs</a>
#
# This should ideally refer to all generated refpages and aliases of them.
#
# With the .html removed, the link text will sometimes be the same as the
# page name; sometimes be an alias to the page name; and sometimes just be
# explanatory text.
#
# We already know that some extension appendices do not appear in the
# category index; they do appear in the alphabetical index.
#
# Unfortunately, some of the anchor text contains HTML constructs or text
# more complex than just a name or a sequence of words. Probably will need
# special handling.
#
# There are the following duplicates in the link text:
#
#cl_image_format
#cl_khr_d3d10_sharing
#cl_khr_d3d11_sharing
#cl_khr_dx9_media_sharing
#cl_khr_gl_event
#cl_khr_gl_sharing
#clamp (different targets, commonClamp / clamp_integer)
#max (different targets, commonMax / integerMax)
#min (different targets, commonMin / integerMin)
#
# In general this is fine, so long as the targets are identical. The
# handful of different targets are also generally fine ATM and can
# be put on an exception list if needed.
#
#gen/refpage
# Contains the extracted and static (from man/static/) refpage source
# files, named e.g. 'clGetPlatformIDs.txt', and the Apache rewrites for
# aliases in 'rewritebody', which look like
#
# RewriteRule ^CL_VERSION_1_0.html$ preprocessorDirectives.html
#
# Combined, these make up the complete refpage set. We would like every
# page and alias to be in the category index (even more ideally, they would
# be searchable in Antora, etc.)
#
# rewritebody describes the actual aliases, and ideally should match the
# aliases in man/toctail.
#
#Comparison process:
# toctail - RE match to page names and descriptions.
# Construct dual index from page name to description(s) and vice-versa
# (handle aliased descriptions somehow).
#
# refpages - scan directory for all .txt files and construct
# name -> name mapping.
# Scan rewritebody (or better, emit it as a Python dictionary as well) and
# construct alias -> name mapping.
# Note that .txt files are 'actual' refpages.

# Matches an anchor in the ToC fragment and captures the href target
# ('link') and the anchor text ('text'). The leading '.*' is greedy, so if a
# line holds several anchors only the last one is captured.
FIND_TOC_LINK_RE = re.compile(r'.*href="(?P<link>[^"]+)[^>]+>(?P<text>[^<]+)</a>')

# Matches a rewrite directive of the form '... ^alias.html$ page.html' and
# captures the source ('alias') and the target ('page').
FIND_REWRITE_LINK_RE = re.compile(r'.*\^(?P<alias>[^$]+)\$ +(?P<page>[^ ]+)')


def scan_toc(lines, tocfile):
    """Build the link <-> link-text indices for the category ToC.

    lines   - iterable of text lines from the ToC file
    tocfile - name of the ToC file, used only in diagnostics

    Returns a tuple (toclinks, toctext):
      toclinks - dict mapping each linked page name (href with '.html'
                 removed) to the set of link texts pointing at it
      toctext  - dict mapping each link text to the set of page names it
                 links to

    Lines without a link are ignored. Links whose target does not end in
    '.html' are reported on stdout and skipped.
    """
    toclinks = {}
    toctext = {}

    for line in lines:
        match = FIND_TOC_LINK_RE.match(line)
        if match is None:
            continue

        link = match.group('link')
        text = match.group('text')

        if not link.endswith('.html'):
            print(f'{tocfile} contains non-.html link {link}: {line}')
            continue

        # Strip the '.html' suffix, leaving the page (or alias) name
        link = link[:-5]

        toclinks.setdefault(link, set()).add(text)
        toctext.setdefault(text, set()).add(link)

    return toclinks, toctext


def scan_rewrites(lines, rewritefile, refpages):
    """Collect alias -> page rewrites and attach aliases to their pages.

    lines       - iterable of text lines from the rewrite file
    rewritefile - name of the rewrite file, used only in diagnostics
    refpages    - dict mapping each actual refpage name to the set of
                  aliases rewriting to it. Updated in place.

    Returns a tuple (refaliases, errcount):
      refaliases - dict mapping each alias name to the page name it is
                   rewritten to (the first rewrite wins on duplicates)
      errcount   - number of duplicate aliases plus rewrites that target a
                   page missing from refpages

    Rewrites whose source or target does not end in '.html' are reported on
    stdout and skipped; they are not counted as errors.
    """
    refaliases = {}
    errcount = 0

    for line in lines:
        # Match against the line without its trailing newline, so that the
        # newline does not end up in the captured page name
        match = FIND_REWRITE_LINK_RE.match(line.rstrip())
        if match is None:
            continue

        alias = match.group('alias')
        page = match.group('page')

        if not alias.endswith('.html'):
            print(f'{rewritefile} contains non-.html rewrite from {alias}: {line}')
            continue

        if not page.endswith('.html'):
            print(f'{rewritefile} contains non-.html rewrite to {page}: {line}')
            continue

        # Now strip .html suffixes
        alias = alias[:-5]
        page = page[:-5]

        # Track alias -> page rewrites
        if alias in refaliases:
            print(f'{rewritefile} contains multiple rewrites of {alias}: {line}')
            errcount += 1
        else:
            refaliases[alias] = page

        # Track all aliases rewriting to a page
        if page not in refpages:
            print(f'{rewritefile} contains rewrite of {alias} to the nonexistent refpage {page}: {line}')
            errcount += 1
        else:
            refpages[page].add(alias)

    return refaliases, errcount


def check_consistency(toclinks, toctext, refpages, refaliases):
    """Cross-check the ToC indices against the refpage and alias sets.

    Reports, on stdout and in sorted order:
      - ToC links that are neither a refpage nor an alias
      - aliases that never appear as ToC link text
      - refpages that are never linked from the ToC

    Returns the number of problems reported.
    """
    errcount = 0

    # Verify that all .html files in toctail exist, either as an actual
    # refpage or as an alias to one
    for link in sorted(toclinks):
        if link not in refpages and link not in refaliases:
            print(f'ToC link to {link}.html has no corresponding refpage {link}.txt, or alias to it')
            errcount += 1

    # Verify that all aliases in refpages/ appear in toctail as descriptions
    for alias in sorted(refaliases):
        if alias not in toctext:
            print(f'refpage alias {alias} has no link from the ToC, at least with that name')
            errcount += 1

    # Verify that all non-aliased names (.txt files) in refpages appear in
    # toctail as .html link targets
    for page in sorted(refpages):
        if page not in toclinks:
            print(f'refpage {page} has no link from the ToC')
            errcount += 1

    return errcount


def main():
    """Parse arguments, load the ToC and refpage data, and print a report.

    Exits with status 1 if the ToC file or the rewrite file cannot be
    loaded. Consistency problems are reported and counted but do not affect
    the exit status.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-tocfile', action='store', dest='tocfile',
                        default='man/toctail',
                        help='Set the ToC file (default man/toctail)')
    parser.add_argument('-refpages', action='store', dest='refpages',
                        default='generated/refpage',
                        help='Set path to generate refpage directory (default generated/refpage)')

    results = parser.parse_args()

    # Scan ToC
    # NOTE(review): loadFile is assumed to return (list of lines, <other>),
    # with None for the lines on failure - confirm against reflib.
    toc_lines, _ = loadFile(results.tocfile)
    if toc_lines is None:
        # Fatal errors go to stderr so they are not lost in the report
        print(f'Cannot open ToC file {results.tocfile}', file=sys.stderr)
        sys.exit(1)

    toclinks, toctext = scan_toc(toc_lines, results.tocfile)

    print(f'{len(toclinks)} links in ToC, {len(toctext)} link-texts')
    for link, texts in toclinks.items():
        if len(texts) > 1:
            print(f'link {link} -> {len(texts)} distinct texts: {texts}')

    for text, links in toctext.items():
        if len(links) > 1:
            print(f'text {text} -> {len(links)} distinct links: {links}')

    # Scan refpages

    # Actual pages - each entry is a set of aliases that rewrite to that page.
    # Every '.txt' file in the refpage directory is an actual refpage; strip
    # the suffix to leave the API name.
    refpages = {}
    for filename in os.listdir(results.refpages):
        if filename.endswith('.txt'):
            refpages[filename[:-4]] = set()

    # Collect aliases from the rewritebody file
    rewritefile = results.refpages + '/rewritebody'
    rewrite_lines, _ = loadFile(rewritefile)
    if rewrite_lines is None:
        print(f'Cannot open HTML rewrite file {rewritefile}', file=sys.stderr)
        sys.exit(1)

    # Page aliases - each entry is the page name they are rewritten to
    refaliases, errcount = scan_rewrites(rewrite_lines, rewritefile, refpages)

    errcount += check_consistency(toclinks, toctext, refpages, refaliases)

    print(f'{errcount} errors found')

    # TODO: Summarize link text (descriptions) that do not appear as refpage
    # aliases. Most are probably just straight text and fine.
    #
    # TODO: Summarize any remaining discrepancies.

    print(f'Found {len(refpages)} reference pages')
    print(f'Found {len(refaliases)} aliases to reference pages')


if __name__ == '__main__':
    main()

0 comments on commit 583384b

Please sign in to comment.