Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LLVM verification #356

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
328 changes: 328 additions & 0 deletions ext/auto-inst/parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
import os
import json
import re
import sys
from collections import defaultdict
import yaml

# Lower-cased instruction name -> category (top-level directory name); filled in by main().
REPO_INSTRUCTIONS = {}
# Root directory of the YAML instruction repository (second CLI argument); set by main().
REPO_DIRECTORY = None

def safe_get(data, key, default=""):
    """Safely fetch *key* from *data*.

    Returns *default* when *data* is not a dict, the key is missing, or the
    key is unhashable. The original used a bare ``except:``, which also
    swallows ``KeyboardInterrupt``/``SystemExit``; the only exception
    ``dict.get`` can actually raise here is ``TypeError`` for an unhashable
    key, so catch exactly that.
    """
    try:
        if isinstance(data, dict):
            return data.get(key, default)
    except TypeError:  # unhashable key (e.g. a list)
        return default
    return default

def load_yaml_encoding(instr_name):
    """
    Locate the YAML file describing *instr_name* via the REPO_INSTRUCTIONS
    index and return its ``(match, variables)`` encoding pair.

    JSON (TableGen) names use '_' where the YAML files use '.', so both
    spellings are tried. Returns ``(None, None)`` when no YAML file exists.
    """
    lowered = instr_name.lower()
    candidates = {lowered, lowered.replace('_', '.')}

    yaml_path = None
    for candidate in candidates:
        category = REPO_INSTRUCTIONS.get(candidate)
        if category is None:
            continue
        path = os.path.join(REPO_DIRECTORY, category, candidate + ".yaml")
        if os.path.isfile(path):
            yaml_path = path
            break

    if yaml_path is None:
        # No matching YAML description found.
        return None, None

    with open(yaml_path, 'r') as yf:
        ydata = yaml.safe_load(yf)

    encoding = safe_get(ydata, 'encoding', {})
    return safe_get(encoding, 'match', None), safe_get(encoding, 'variables', [])

def compare_yaml_json_encoding(yaml_match, yaml_vars, json_encoding_str):
    """Compare a YAML encoding against the JSON (TableGen) encoding string.

    Parameters
    ----------
    yaml_match : str or None
        32-character pattern; '0'/'1' are fixed bits, '-' marks variable bits.
    yaml_vars : list of dict
        Each with a ``name`` and a ``location`` such as ``"24-20"``, ``"31"``,
        or the '|'-separated multi-range form used for split fields
        (e.g. ``"31-25|11-7"`` for S-type immediates).
    json_encoding_str : str
        Concatenation of 32 per-bit tokens, MSB first; fixed bits are
        '0'/'1', variable bits look like ``rs1[4]``.

    Returns
    -------
    list of str
        Human-readable differences; empty when the encodings agree.
    """
    if not yaml_match:
        return ["No YAML match field available for comparison."]
    if not json_encoding_str:
        return ["No JSON encoding available for comparison."]

    # Normalize '-' (variable bit) to a wildcard marker.
    yaml_pattern_str = yaml_match.replace('-', '.')
    if len(yaml_pattern_str) != 32:
        return [f"YAML match pattern length is {len(yaml_pattern_str)}, expected 32. Cannot compare properly."]

    def parse_location(loc_str):
        """Parse '24-20', '31', or '31-25|11-7' into a list of (high, low) ranges."""
        ranges = []
        for part in str(loc_str).split('|'):
            if '-' in part:
                high, low = part.split('-')
                ranges.append((int(high), int(low)))
            else:
                bit = int(part)
                ranges.append((bit, bit))
        return ranges

    yaml_var_positions = {}
    for var in yaml_vars or []:
        try:
            yaml_var_positions[var["name"]] = parse_location(var["location"])
        except (KeyError, TypeError, ValueError):
            # A malformed variable entry should be reported, not crash the run.
            return [f"Could not parse variable location in YAML entry {var!r}."]

    # Tokenize the JSON encoding: single '0'/'1' bits or field references.
    tokens = re.findall(r'(?:[01]|[A-Za-z0-9]+(?:\[\d+\])?)', json_encoding_str)
    json_bits = []
    bit_index = 31
    for t in tokens:
        json_bits.append((bit_index, t))
        bit_index -= 1

    if bit_index != -1:
        return [f"JSON encoding does not appear to be 32 bits. Ends at bit {bit_index+1}."]

    differences = []
    # Position -> token lookup (the original rescanned the list for every bit).
    bit_at = dict(json_bits)

    # Check fixed bits.
    for b in range(32):
        yaml_bit = yaml_pattern_str[31 - b]
        json_bit_str = bit_at.get(b)
        if json_bit_str is None:
            differences.append(f"Bit {b}: No corresponding JSON bit found.")
            continue

        if yaml_bit in ['0', '1']:
            if json_bit_str not in ['0', '1']:
                differences.append(f"Bit {b}: YAML expects fixed bit '{yaml_bit}' but JSON has '{json_bit_str}'")
            elif json_bit_str != yaml_bit:
                differences.append(f"Bit {b}: YAML expects '{yaml_bit}' but JSON has '{json_bit_str}'")
        else:
            # Variable bit in YAML must not be fixed in JSON.
            if json_bit_str in ['0', '1']:
                differences.append(f"Bit {b}: YAML variable bit but JSON is fixed '{json_bit_str}'")

    # Check that each YAML variable maps onto exactly one JSON field name.
    for var_name, ranges in yaml_var_positions.items():
        json_var_fields = []
        for high, low in ranges:
            for bb in range(low, high + 1):
                json_var_fields.append(bit_at.get(bb, '?'))

        field_names = set(re.findall(r'([A-Za-z0-9]+)\[\d+\]', ' '.join(json_var_fields)))
        span = '|'.join(f"{high}-{low}" for high, low in ranges)
        if len(field_names) == 0:
            differences.append(f"Variable {var_name}: No corresponding field found in JSON bits {span}")
        elif len(field_names) > 1:
            differences.append(f"Variable {var_name}: Multiple fields {field_names} found in JSON for bits {span}")

    return differences

def safe_print_instruction_details(name: str, data: dict, output_stream):
    """Write a formatted report for one instruction to *output_stream* and
    compare its YAML encoding against the JSON (TableGen) encoding.

    Any error while processing one instruction is reported into the stream
    and swallowed, so a single bad record cannot abort the whole run.
    The inner bare ``except:`` handlers of the original are narrowed to the
    exceptions that can actually occur.
    """
    try:
        # Header
        output_stream.write(f"\n{name} Instruction Details\n")
        output_stream.write("=" * 50 + "\n")

        # Basic Information
        output_stream.write("\nBasic Information:\n")
        output_stream.write("-" * 20 + "\n")
        output_stream.write(f"Name: {name}\n")
        output_stream.write(f"Assembly Format: {safe_get(data, 'AsmString', 'N/A')}\n")
        output_stream.write(f"Size: {safe_get(data, 'Size', 'N/A')} bytes\n")

        # Location: first entry of TableGen's '!locs' list, when present.
        locs = safe_get(data, '!locs', [])
        loc = locs[0] if isinstance(locs, list) and len(locs) > 0 else "N/A"
        output_stream.write(f"Location: {loc}\n")

        # Operands
        output_stream.write("\nOperands:\n")
        output_stream.write("-" * 20 + "\n")
        try:
            # safe_get returns {} when the key is missing; .get can still
            # raise AttributeError if the value is present but not a dict.
            in_ops = safe_get(data, 'InOperandList', {}).get('printable', 'N/A')
            output_stream.write(f"Inputs: {in_ops}\n")
        except AttributeError:
            output_stream.write("Inputs: N/A\n")

        try:
            out_ops = safe_get(data, 'OutOperandList', {}).get('printable', 'N/A')
            output_stream.write(f"Outputs: {out_ops}\n")
        except AttributeError:
            output_stream.write("Outputs: N/A\n")

        # Instruction Properties
        output_stream.write("\nInstruction Properties:\n")
        output_stream.write("-" * 20 + "\n")
        output_stream.write(f"Commutable: {'Yes' if safe_get(data, 'isCommutable', 0) else 'No'}\n")
        output_stream.write(f"Memory Load: {'Yes' if safe_get(data, 'mayLoad', 0) else 'No'}\n")
        output_stream.write(f"Memory Store: {'Yes' if safe_get(data, 'mayStore', 0) else 'No'}\n")
        output_stream.write(f"Side Effects: {'Yes' if safe_get(data, 'hasSideEffects', 0) else 'No'}\n")

        # Scheduling Info
        sched = safe_get(data, 'SchedRW', [])
        if sched:
            output_stream.write("\nScheduling Information:\n")
            output_stream.write("-" * 20 + "\n")
            output_stream.write("Operations:\n")
            try:
                for op in sched:
                    if isinstance(op, dict):
                        output_stream.write(f" - {op.get('printable', 'N/A')}\n")
            except TypeError:
                # sched was truthy but not iterable.
                output_stream.write(" - Unable to parse scheduling information\n")

        # Encoding pattern. TableGen's 'Inst' lists one entry per bit; fixed
        # bits are 0/1, variable bits are {'var': ..., 'index': ...} dicts.
        output_stream.write("\nEncoding Pattern:\n")
        output_stream.write("-" * 20 + "\n")
        encoding = ""
        try:
            encoding_bits = []
            for bit in safe_get(data, 'Inst', []):
                if isinstance(bit, dict):
                    encoding_bits.append(f"{bit.get('var', '?')}[{bit.get('index', '?')}]")
                else:
                    encoding_bits.append(str(bit))
            # Reverse the bit order before joining so the string is MSB-first,
            # matching what compare_yaml_json_encoding expects.
            encoding_bits.reverse()
            encoding = "".join(encoding_bits)
            output_stream.write(f"Binary Format: {encoding}\n")
        except TypeError:
            output_stream.write("Binary Format: Unable to parse encoding\n")
            encoding = ""

        # Compare YAML vs JSON encodings when both are available.
        yaml_match, yaml_vars = load_yaml_encoding(name)
        if yaml_match is not None and encoding:
            differences = compare_yaml_json_encoding(yaml_match, yaml_vars, encoding)
            if differences:
                output_stream.write("\nDifferences in encoding:\n")
                for d in differences:
                    output_stream.write(f" - {d}\n")
                    print(f"Difference in {name}: {d}")  # also surface on console
            else:
                output_stream.write("\nNo encoding differences found.\n")
        else:
            # Note which side of the comparison is missing.
            if yaml_match is None:
                output_stream.write("\nNo YAML encoding match found for comparison.\n")
            if not encoding:
                output_stream.write("\nNo JSON encoding found for comparison.\n")

        output_stream.write("\n")
    except Exception as e:
        # Keep going: one malformed instruction must not abort the report.
        output_stream.write(f"Error processing instruction {name}: {str(e)}\n")
        output_stream.write("Continuing with next instruction...\n\n")

def get_repo_instructions(repo_directory):
    """
    Walk *repo_directory* recursively and index every YAML-defined
    instruction: lower-cased file stem -> category, where the category is
    the first path component below the repository root ("Other" for files
    that live directly at the root).
    """
    instructions = {}
    for dirpath, _dirs, filenames in os.walk(repo_directory):
        relative = os.path.relpath(dirpath, repo_directory)
        category = "Other" if relative == '.' else relative.split(os.sep)[0]

        for filename in filenames:
            if not filename.endswith(".yaml"):
                continue
            stem = os.path.splitext(filename)[0]
            # Lower-cased key for case-insensitive lookups later on.
            instructions[stem.lower()] = category
    return instructions

def find_json_key(instr_name, json_data):
    """
    Look up *instr_name* in *json_data*, tolerating the naming-convention
    mismatches between the YAML repository and the TableGen JSON dump:
    '.' vs '_', and lower/UPPER/Capitalized spellings.

    Returns the first matching key, or None when no variant is present.
    """
    lowered = instr_name.lower()
    underscored = instr_name.replace('.', '_')
    variants = {
        lowered,
        lowered.replace('.', '_'),
        instr_name.upper(),
        underscored.upper(),
        instr_name.capitalize(),
        underscored.capitalize(),
    }

    return next((variant for variant in variants if variant in json_data), None)

def main():
    """Entry point: index the YAML instruction repository, match each
    instruction against the TableGen JSON dump, and write a summary plus
    per-instruction details (and encoding differences) to output.txt."""
    global REPO_INSTRUCTIONS, REPO_DIRECTORY

    if len(sys.argv) != 3:
        print("Usage: python riscv_parser.py <tablegen_json_file> <arch_inst_directory>")
        sys.exit(1)

    json_file = sys.argv[1]
    REPO_DIRECTORY = sys.argv[2]

    # Build the name -> category index from the repository layout.
    REPO_INSTRUCTIONS = get_repo_instructions(REPO_DIRECTORY)
    if not REPO_INSTRUCTIONS:
        print("No instructions found in the provided repository directory.")
        sys.exit(1)

    try:
        with open(json_file, 'r') as f:
            data = json.loads(f.read())
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        sys.exit(1)

    # Pair every YAML instruction with its JSON record, skipping the rest.
    all_instructions = []
    for yaml_name in REPO_INSTRUCTIONS:
        json_key = find_json_key(yaml_name, data)
        if json_key is None:
            print(f"DEBUG: Instruction '{yaml_name}' (from YAML) not found in JSON, skipping...", file=sys.stderr)
            continue

        record = data.get(json_key)
        if not isinstance(record, dict):
            print(f"DEBUG: Instruction '{yaml_name}' is in JSON but not a valid dict, skipping...", file=sys.stderr)
            continue

        all_instructions.append((json_key, record))

    all_instructions.sort(key=lambda pair: pair[0].lower())

    with open("output.txt", "w") as outfile:
        outfile.write("RISC-V Instruction Summary\n")
        outfile.write("=" * 50 + "\n")
        outfile.write(f"\nTotal Instructions Found: {len(all_instructions)}\n")
        for key, _record in all_instructions:
            outfile.write(f" - {key}\n")

        outfile.write("\nDETAILED INSTRUCTION INFORMATION\n")
        outfile.write("=" * 80 + "\n")

        # One detailed section per instruction, in sorted order.
        for key, record in all_instructions:
            safe_print_instruction_details(key, record, outfile)

    print("Output has been written to output.txt")


if __name__ == '__main__':
    main()
Loading