LLVM verification #356

Draft: wants to merge 7 commits into base: main
3 binary files not shown.
355 changes: 355 additions & 0 deletions ext/auto-inst/parsing.py
@@ -0,0 +1,355 @@
import os
import json
import re
import sys
from collections import defaultdict
import yaml

REPO_INSTRUCTIONS = {}
REPO_DIRECTORY = None

def safe_get(data, key, default=""):
"""Safely get a value from a dictionary, return default if not found or error."""
try:
if isinstance(data, dict):
return data.get(key, default)
return default
    except Exception:
return default

def load_yaml_encoding(instr_name):
"""
Given an instruction name (from JSON), find the corresponding YAML file and load its encoding data.
    We try to match instr_name to a YAML file using REPO_INSTRUCTIONS plus simple name transformations (lowercasing, and replacing underscores with dots).
"""
candidates = set()
lower_name = instr_name.lower()
candidates.add(lower_name)
candidates.add(lower_name.replace('_', '.'))

yaml_file_path = None
yaml_category = None
for cand in candidates:
if cand in REPO_INSTRUCTIONS:
yaml_category = REPO_INSTRUCTIONS[cand]
yaml_file_path = os.path.join(REPO_DIRECTORY, yaml_category, cand + ".yaml")
if os.path.isfile(yaml_file_path):
break
else:
yaml_file_path = None

if not yaml_file_path or not os.path.isfile(yaml_file_path):
# YAML not found
return None, None

# Load the YAML file
with open(yaml_file_path, 'r') as yf:
ydata = yaml.safe_load(yf)

encoding = safe_get(ydata, 'encoding', {})
yaml_match = safe_get(encoding, 'match', None)
yaml_vars = safe_get(encoding, 'variables', [])

return yaml_match, yaml_vars

def compare_yaml_json_encoding(instr_name, yaml_match, yaml_vars, json_encoding_str):
"""
Compare the YAML encoding (match + vars) with the JSON encoding (binary format).
If the JSON has a variable like vm[?], it should be treated as just vm.

If instr_name starts with 'C_', then treat the instruction as 16 bits long.
Otherwise, treat it as 32 bits long.

Return a list of differences.
"""
if not yaml_match:
return ["No YAML match field available for comparison."]
if not json_encoding_str:
return ["No JSON encoding available for comparison."]

# Determine expected length based on whether it's a compressed instruction (C_)
expected_length = 16 if instr_name.startswith('C_') else 32

yaml_pattern_str = yaml_match.replace('-', '.')
if len(yaml_pattern_str) != expected_length:
return [f"YAML match pattern length is {len(yaml_pattern_str)}, expected {expected_length}. Cannot compare properly."]

def parse_location(loc_str):
# Ensure loc_str is a string
loc_str = str(loc_str).strip()
if '-' in loc_str:
high, low = loc_str.split('-')
return int(high), int(low)
else:
# If no dash, treat it as a single bit field
val = int(loc_str)
return val, val


yaml_var_positions = {}
for var in yaml_vars:
high, low = parse_location(var["location"])
yaml_var_positions[var["name"]] = (high, low)

# Tokenize the JSON encoding string. We assume it should match the expected_length in bits.
tokens = re.findall(r'(?:[01]|[A-Za-z0-9]+(?:\[\d+\]|\[\?\])?)', json_encoding_str)
json_bits = []
bit_index = expected_length - 1
for t in tokens:
json_bits.append((bit_index, t))
bit_index -= 1

if bit_index != -1:
return [f"JSON encoding does not appear to be {expected_length} bits. Ends at bit {bit_index+1}."]

# Normalize JSON bits (handle vm[?] etc.)
normalized_json_bits = []
for pos, tt in json_bits:
if re.match(r'vm\[[^\]]*\]', tt):
tt = 'vm'
normalized_json_bits.append((pos, tt))
json_bits = normalized_json_bits

differences = []

# Check fixed bits
for b in range(expected_length):
yaml_bit = yaml_pattern_str[expected_length - 1 - b]
token = [tt for (pos, tt) in json_bits if pos == b]
if not token:
differences.append(f"Bit {b}: No corresponding JSON bit found.")
continue
json_bit_str = token[0]

if yaml_bit in ['0', '1']:
if json_bit_str not in ['0', '1']:
differences.append(f"Bit {b}: YAML expects fixed bit '{yaml_bit}' but JSON has '{json_bit_str}'")
elif json_bit_str != yaml_bit:
differences.append(f"Bit {b}: YAML expects '{yaml_bit}' but JSON has '{json_bit_str}'")
else:
if json_bit_str in ['0', '1']:
differences.append(f"Bit {b}: YAML variable bit but JSON is fixed '{json_bit_str}'")

# Check variable fields
for var_name, (high, low) in yaml_var_positions.items():
# Ensure the variable range fits within the expected_length
if high >= expected_length or low < 0:
differences.append(f"Variable {var_name}: location {high}-{low} is out of range for {expected_length}-bit instruction.")
continue

json_var_fields = []
for bb in range(low, high+1):
token = [tt for (pos, tt) in json_bits if pos == bb]
if token:
json_var_fields.append(token[0])
else:
json_var_fields.append('?')

# Extract field names
field_names = set(re.findall(r'([A-Za-z0-9]+)(?:\[\d+\]|\[\?\])?', ' '.join(json_var_fields)))
if len(field_names) == 0:
differences.append(f"Variable {var_name}: No corresponding field found in JSON bits {high}-{low}")
elif len(field_names) > 1:
differences.append(f"Variable {var_name}: Multiple fields {field_names} found in JSON for bits {high}-{low}")

return differences

def safe_print_instruction_details(name: str, data: dict, output_stream):
"""Print formatted instruction details and compare YAML/JSON encodings."""
try:
output_stream.write(f"\n{name} Instruction Details\n")
output_stream.write("=" * 50 + "\n")

output_stream.write("\nBasic Information:\n")
output_stream.write("-" * 20 + "\n")
output_stream.write(f"Name: {name}\n")
output_stream.write(f"Assembly Format: {safe_get(data, 'AsmString', 'N/A')}\n")
output_stream.write(f"Size: {safe_get(data, 'Size', 'N/A')} bytes\n")
Collaborator: If an instruction doesn't have a Size, it cannot be encoded, so it's likely not of interest to checking the encoding.
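A minimal sketch of how that check could be applied inside safe_print_instruction_details (hypothetical, not part of this patch as written):

# Hypothetical guard based on the comment above: skip instructions that
# report no Size, since they cannot be encoded and there is nothing to compare.
size = safe_get(data, 'Size', None)
if not size:
    output_stream.write("Skipping: no Size field, instruction is not encodable.\n")
    return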


locs = safe_get(data, '!locs', [])
loc = locs[0] if isinstance(locs, list) and len(locs) > 0 else "N/A"
output_stream.write(f"Location: {loc}\n")

output_stream.write("\nOperands:\n")
output_stream.write("-" * 20 + "\n")
try:
in_ops = safe_get(data, 'InOperandList', {}).get('printable', 'N/A')
output_stream.write(f"Inputs: {in_ops}\n")
        except Exception:
output_stream.write("Inputs: N/A\n")

try:
out_ops = safe_get(data, 'OutOperandList', {}).get('printable', 'N/A')
output_stream.write(f"Outputs: {out_ops}\n")
        except Exception:
output_stream.write("Outputs: N/A\n")

# Encoding
output_stream.write("\nEncoding Pattern:\n")
output_stream.write("-" * 20 + "\n")
encoding_bits = []
try:
inst = safe_get(data, 'Inst', [])
for bit in inst:
if isinstance(bit, dict):
encoding_bits.append(f"{bit.get('var', '?')}[{bit.get('index', '?')}]")
else:
encoding_bits.append(str(bit))
# Reverse the bit order before joining
encoding_bits.reverse()
encoding = "".join(encoding_bits)
output_stream.write(f"JSON Encoding: {encoding}\n")
        except Exception:
output_stream.write("JSON Encoding: Unable to parse encoding\n")
encoding = ""

# compare YAML vs JSON encodings
yaml_match, yaml_vars = load_yaml_encoding(name)
if yaml_match is not None:
output_stream.write(f"YAML Encoding: {yaml_match}\n")
else:
output_stream.write("YAML Encoding: Not found\n")

if yaml_match and encoding:
# Perform comparison
differences = compare_yaml_json_encoding(name, yaml_match, yaml_vars, encoding)
if differences and len(differences) > 0:
output_stream.write("\nEncodings do not match. Differences:\n")
for d in differences:
output_stream.write(f" - {d}\n")
print(f"Difference in {name}: {d}", file=sys.stdout) # Print to console
else:
output_stream.write("\nEncodings Match: No differences found.\n")
else:
# If we have no YAML match or no JSON encoding, we note that we can't compare
output_stream.write("\nComparison: Cannot compare encodings (missing YAML or JSON encoding).\n")

output_stream.write("\n")
except Exception as e:
output_stream.write(f"Error processing instruction {name}: {str(e)}\n")
output_stream.write("Continuing with next instruction...\n\n")

def get_repo_instructions(repo_directory):
"""
Recursively find all YAML files in the repository and extract instruction names along with their category.
"""
repo_instructions = {}
for root, _, files in os.walk(repo_directory):
for file in files:
if file.endswith(".yaml"):
instr_name = os.path.splitext(file)[0]
relative_path = os.path.relpath(root, repo_directory)
repo_instructions[instr_name.lower()] = relative_path
return repo_instructions

def find_json_key(instr_name, json_data):
"""
Find a matching instruction in json_data by comparing against AsmString values.
Returns the matching key if found, None otherwise.

Args:
instr_name (str): The instruction name from YAML
json_data (dict): The JSON data containing instruction information

Returns:
str or None: The matching key from json_data if found, None otherwise
"""
# First, normalize the instruction name for comparison
instr_name = instr_name.lower().strip()

# Search through all entries in json_data
for key, value in json_data.items():
Collaborator: You can use the !instanceof metadata to search only through things that have an encoding; I pointed this out in the comment about the initial approach. This will save you iterating through aliases (and other data like isel patterns).

Collaborator (author): From what I can see, !instanceof follows this structure, but I'm finding it hard to understand how to use it for parsing purposes. Can you please explain further?

{
  "!instanceof": {
    "AES_1Arg_Intrinsic": [
      "int_arm_neon_aesimc",
      "int_arm_neon_aesmc"
    ],
    "AES_2Arg_Intrinsic": [
      "int_arm_neon_aesd",
      "int_arm_neon_aese"
    ],
    "ALUW_rr": [
      "ADDW",
      "ADD_UW",
      "DIVUW",
      "DIVW",
      "MULW",
      "PACKW",
      "REMUW",
      "REMW",
      "ROLW",
      "RORW",
      "SH1ADD_UW",
      "SH2ADD_UW",
      "SH3ADD_UW",
      "SLLW",
      "SRAW",
      "SRLW",
      "SUBW"
    ],
    "ALU_ri": [
      "ADDI",
      "ANDI",
      "ORI",
      "SLTI",
      "SLTIU",
      "XORI"
    ],

Collaborator: So, let's assume you parsed the whole JSON object into a Python variable called json.

json["!instanceof"] is a map from TableGen classes to a list of definition names that are instances of that class (or its sub-classes). These definition names appear at the top level of the JSON.

You would use code like the following:

for def_name in json["!instanceof"]["RVInstCommon"]:
  def_data = json[def_name]
  ...

This saves you having to look at all the tablegen data that is not an instruction (so an alias or a pattern or CSR or something).

Note you'll still have to look at isPseudo and isCodeGenOnly and potentially exclude items where one or both of those is true.
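Building on that explanation, a sketch of how the lookup in this script could be narrowed (the class name RVInstCommon and the isPseudo/isCodeGenOnly flags are taken from the comment above; data is the parsed JSON dictionary used in run_parser, and this filter is not part of the current patch):

# Hypothetical: iterate only over real instruction records via !instanceof,
# skipping pseudo and codegen-only definitions.
for def_name in data.get("!instanceof", {}).get("RVInstCommon", []):
    def_data = data[def_name]
    if def_data.get("isPseudo") or def_data.get("isCodeGenOnly"):
        continue
    # ... compare def_data['AsmString'] against the YAML instruction name here ...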

if not isinstance(value, dict):
continue

# Get the AsmString value and normalize it
asm_string = safe_get(value, 'AsmString', '').lower().strip()
if not asm_string:
continue

# Extract the base instruction name from AsmString
# AsmString might be in format like "add $rd, $rs1, $rs2"
# We want just "add"
base_asm_name = asm_string.split()[0]

if base_asm_name == instr_name:
return key

return None

def run_parser(json_file, repo_directory, output_file="output.txt"):
"""
Run the parser logic:
1. Get instructions from the repo directory.
2. Parse the JSON file and match instructions.
3. Generate output.txt with instruction details.
"""
global REPO_INSTRUCTIONS, REPO_DIRECTORY
REPO_DIRECTORY = repo_directory

# Get instructions and categories from the repository structure
REPO_INSTRUCTIONS = get_repo_instructions(REPO_DIRECTORY)
if not REPO_INSTRUCTIONS:
print("No instructions found in the provided repository directory.")
return None

try:
# Read and parse JSON
with open(json_file, 'r') as f:
data = json.loads(f.read())
except Exception as e:
print(f"Error reading file: {str(e)}")
return None

all_instructions = []

# For each YAML instruction, try to find it in the JSON data
for yaml_instr_name, category in REPO_INSTRUCTIONS.items():
json_key = find_json_key(yaml_instr_name, data)
if json_key is None:
print(f"DEBUG: Instruction '{yaml_instr_name}' (from YAML) not found in JSON, skipping...", file=sys.stderr)
continue

instr_data = data.get(json_key)
if not isinstance(instr_data, dict):
print(f"DEBUG: Instruction '{yaml_instr_name}' is in JSON but not a valid dict, skipping...", file=sys.stderr)
continue

# Add this instruction to our list
all_instructions.append((json_key, instr_data))

# Sort all instructions by name
all_instructions.sort(key=lambda x: x[0].lower())

with open(output_file, "w") as outfile:
outfile.write("RISC-V Instruction Summary\n")
outfile.write("=" * 50 + "\n")
total = len(all_instructions)
outfile.write(f"\nTotal Instructions Found: {total}\n")
for name, _ in all_instructions:
outfile.write(f" - {name}\n")

outfile.write("\nDETAILED INSTRUCTION INFORMATION\n")
outfile.write("=" * 80 + "\n")

# Print details for each instruction directly
for name, instr_data in all_instructions:
safe_print_instruction_details(name, instr_data, outfile)

print(f"Output has been written to {output_file}")
return output_file

def main():
if len(sys.argv) != 3:
        print("Usage: python parsing.py <tablegen_json_file> <arch_inst_directory>")
sys.exit(1)

json_file = sys.argv[1]
repo_directory = sys.argv[2]

result = run_parser(json_file, repo_directory, output_file="output.txt")
if result is None:
sys.exit(1)

if __name__ == '__main__':
main()
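For reference, run_parser can also be driven programmatically rather than through the command line. A minimal sketch, assuming the script is importable as parsing and that both paths below (placeholders, not from this patch) point at real inputs:

import parsing

report = parsing.run_parser("path/to/tablegen.json", "path/to/arch/inst", output_file="output.txt")
if report:
    print(f"Report written to {report}")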