From 105b6bdf5294b35a4da341438adabdecaf71d359 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Thu, 21 Mar 2024 17:37:41 +0100 Subject: [PATCH] adds cleaning script --- utils/scripts/clean_listings_specfem.py | 486 ++++++++++++++++++++++++ 1 file changed, 486 insertions(+) create mode 100755 utils/scripts/clean_listings_specfem.py diff --git a/utils/scripts/clean_listings_specfem.py b/utils/scripts/clean_listings_specfem.py new file mode 100755 index 000000000..8f2680b76 --- /dev/null +++ b/utils/scripts/clean_listings_specfem.py @@ -0,0 +1,486 @@ +#!/usr/bin/env python +# +# fortran source code cleaning script +# +from __future__ import print_function + +import sys +import os +import filecmp +import shutil +import re +import subprocess +import glob +import fnmatch + +############################################################################################ + +# shows a diff between original and new content (if any) +show_diff = True + +# replaces original with new content (if anything changed) +replace_file_content = False + +############################################################################################ + +# Define the list of file extensions to be processed for Fortran formatting +fortran_file_extensions = ['.fh', '.f90', '.F90', '.fh.in'] + +# Define the list of file extensions to be processed as general formatting +general_file_extensions = ['.bash', '.c', '.cpp','.csh','.cu','.h','.h.in','.pl','.tex','.txt','.sh','.rb', '.md'] + +# Define the list of directories to be excluded (these are mostly submodules included in the source repositories) +exclude_dirs = ['.git', 'm4', './utils/ADJOINT_TOMOGRAPHY_TOOLS/flexwin', './src/inverse_problem_for_source/pyCMT3D'] + +# Define the list of files to be excluded +exclude_files = ['*Par_file*'] + +# list of regex patterns to be replaced +patterns = [ + # suppress trailing white spaces and carriage return + (r'\s*$', ''), + # use new syntax of comparison operators, ignoring case in starting pattern 
(useful in case of mixed case) + (r'\.le\.', '<='), + (r'\.ge\.', '>='), + (r'\.lt\.', '<'), + (r'\.gt\.', '>'), + (r'\.ne\.', '/='), + (r'\.eq\.', '=='), + # switch to lowercase for comparison operators + (r'\.and\.', '.and.'), + (r'\.or\.', '.or.'), + (r'\.not\.', '.not.'), + (r'\.eqv\.', '.eqv.'), + (r'\.neqv\.', '.neqv.'), + (r'\.true\.', '.true.'), + (r'\.false\.', '.false.'), + # switch to Fortran2008 standard + (r'call\s*getarg\(', 'call get_command_argument('), + # constant strings + (r'endsubroutine', 'end subroutine'), + (r'if\s*\(', 'if ('), + (r'\)\s*then', ') then'), + (r'end\s*if', 'endif'), + (r'end\s*do', 'enddo'), + (r'else\s*if', 'else if'), + # force lowercase keywords + (r'subroutine', 'subroutine'), + (r'end\s*subroutine', 'end subroutine'), + (r'function', 'function'), + (r'end\s*function', 'end function'), + (r'continue', 'continue'), + (r'implicit none', 'implicit none'), + (r'implicit', 'implicit'), + (r'return', 'return'), + (r' go\s*to ', ' goto '), + (r'use\s*::\s*mpi', 'use mpi'), + (r',\s*only\s*:\s*', ', only: '), + (r'NOISE_SOURCE_TIME_FUNCTION_TYPE', 'noise_source_time_function_type'), + # do not move this before the above line in which we change the keyword "function" + (r'use_ricker_time_function', 'USE_RICKER_TIME_FUNCTION'), + (r'print_source_time_function', 'PRINT_SOURCE_TIME_FUNCTION'), + (r'external_source_time_function', 'EXTERNAL_SOURCE_TIME_FUNCTION'), + (r'sourceTimeFunction', 'sourceTimeFunction'), + (r'external_stf', 'EXTERNAL_SOURCE_TIME_FUNCTION'), + (r'EXTERNAL_SOURCE_TIME_FUNCTION_filename', 'external_source_time_function_filename'), + (r'read_EXTERNAL_SOURCE_TIME_FUNCTION', 'read_external_source_time_function'), + (r'USE_MAP_function', 'USE_MAP_FUNCTION'), + (r'enddo_LOOP_IJK', 'ENDDO_LOOP_IJK'), + (r'enddo_LOOP_IJ', 'ENDDO_LOOP_IJ'), + (r'OMP do', 'OMP DO'), + (r'OMP enddo', 'OMP ENDDO'), + (r'print\*', 'print *'), + (r'print\s*\*', 'print *'), + (r'spectral-elements', 'spectral elements'), + (r'gaussian', 
'Gaussian'), + (r'hessian', 'Hessian'), + (r'cartesian', 'Cartesian'), + # suppress space between parenthesis and .not. (this can happen when testing logical operators) + (r'\( \.not\. ', '(.not. '), + (r'\)call', ') call'), + # enforce upper case + (r'CUSTOM_REAL', 'CUSTOM_REAL'), + # do not use null strings, which are not part of the Fortran standard (and the IBM xlf compiler rejects them for instance) + (r'print\s*\*\s*,\s*\'\'', 'print *'), + (r'write\s*\(\s*\*\s*,\s*\*\s*\)\s*\'\'', 'print *'), + (r'write\s*\(\s*IMAIN\s*,\s*\*\s*\)\s*\'\'', 'write(IMAIN,*)'), + (r'write\s*\(\s*IOUT\s*,\s*\*\s*\)\s*\'\'', 'write(IOUT,*)'), + (r'print\s*\*\s*,\s*""', 'print *'), + (r'write\s*\(\s*\*\s*,\s*\*\s*\)\s*""', 'print *'), + (r'write\s*\(\s*IMAIN\s*,\s*\*\s*\)\s*""', 'write(IMAIN,*)'), + (r'write\s*\(\s*IOUT\s*,\s*\*\s*\)\s*""', 'write(IOUT,*)'), + # unit 6 means standard output, replace it with standard output symbol + (r'write\s*\(\s*6\s*,\s*\*\s*\)', 'write(*,*)'), + (r'write\s*\(\s*6\s*,', 'write(*,'), + # force space in , & at end of line + (r'\s*\,\s*&\s*$', ', &'), + # always use upper case for GLL when used as a word + (r' gll ', ' GLL '), + (r' mpi ', ' MPI '), + (r' pml ', ' PML '), + # fix some typos I have found in the different codes, or non-US spelling. + # also switch to US spelling in order to have the same standard in all files. 
+ (r'regularisation', 'regularization'), + (r'optimisation', 'optimization'), + (r'analitical', 'analytical'), + # (r'communIcation', 'communication'), + (r' in orfer ', ' in order '), + (r' stepest ', ' steepest '), + (r' stepest$', ' steepest'), + (r'aloow', 'allow'), + (r'neighbour', 'neighbor'), + (r'vecotr', 'vector'), + (r'computse', 'compute'), + (r'indicies', 'indices'), + (r'accordig', 'according'), + (r'paralell', 'parallel'), + (r'debbug', 'debug'), + # do not suppress the white space here because it would then change "debugging" for instance + (r'debugg ', 'debug '), + (r'debugg$', 'debug'), + (r'familly', 'family'), + (r'warnning', 'warning'), + (r'elemement', 'element'), + (r'cartesion', 'Cartesian'), + (r'partiton', 'partition'), + (r'drection', 'direction'), + (r'seperation', 'separation'), + (r'inverision', 'inversion'), + (r'restauration', 'restoration'), + (r'restaure', 'restore'), + (r'memmory', 'memory'), + (r'convolution formation', 'convolution formulation'), + (r'fortran', 'Fortran'), + (r'adress', 'address'), + (r'gFortran', 'gfortran'), + (r' usefull ', ' useful '), + (r' usefull$', ' useful'), + # enforce upper case + (r'MAX_neighborS', 'MAX_NEIGHBORS'), +] + +# list of regex patterns to be replaced only for selected files w/out excluded files +special_patterns = [ + # operators + (r'\s*<\s*=\s*', ' <= '), + (r'\s*>\s*=\s*', ' >= '), + (r'\s*<\s*', ' < '), + (r'\s*/=\s*', ' /= '), + # restore operators that may have been split by the above introduction of white spaces + (r'<\s*=', '<='), + (r'>\s*=', '>='), + (r'=\s*=', '=='), + (r'/\s*=', '/='), + # also restore bash file pipes that may appear in some print statements that save bash scripts to disk for future processing + (r'>\s*&', '>&'), + (r'<\s*&', '<&'), + # also restore xml-formatting strings '< and >' + (r'\'\s*<\s*', '\'<'), + (r'\s*>\s*\'', '>\''), + # for pointers + (r'\s*=\s*>\s*(?!$)', ' => '), +] + +# patterns for comment/non-comment lines +comment = [ '!' 
] +comment_patterns = [ + (r'-\s*>', '->'), + (r'<\s*-', '<-'), +] +non_comment_patterns = [ + (r'(?(?!=)(?!\')(?!&)\s*', ' > '), + (r'\s*==(?!=)\s*', ' == '), + (r'(? 0: + first_letter = line_nospace[0] + else: + first_letter = '' + #print(f"line {i}: first_letter={first_letter} line: {line}") + + ## general patterns + for pattern, replacement in patterns: + #print(f"pattern: {pattern}") + line = re.sub(pattern, replacement, line, flags=re.IGNORECASE) + + ## special patterns formatting (operators,..) + # check if line has xml format (contains ' patterns) + xml_patterns = [ r'\'' ] + has_xml_pattern = False + for pattern in xml_patterns: + if re.search(pattern, line): + has_xml_pattern = True + break + if not has_xml_pattern: + ## Replace special patterns + for pattern, replacement in special_patterns: + line = re.sub(pattern, replacement, line, flags=re.IGNORECASE) + + ## Replace patterns on non-comment lines + if first_letter in comment: + # comment line + for pattern, replacement in comment_patterns: + #print(f"comment pattern: {pattern}") + line = re.sub(pattern, replacement, line, flags=re.IGNORECASE) + else: + # non-comment line + for pattern, replacement in non_comment_patterns: + #print(f"non-comment pattern: {pattern}") + line = re.sub(pattern, replacement, line, flags=re.IGNORECASE) + + ## special formatting + # "write(IMAIN,*)'my-comment'" -> "write(IMAIN,*) 'my-comment'" + if re.search(r'\bwrite\s*\(IMAIN,\*\)\'[^\']*\'', line): + line = re.sub(r'\)(?=\'[^\']*\')', ') ', line, flags=re.IGNORECASE) + + # "write(IMAIN,*)my-parameter" -> "write(*,*) my-parameter" + if re.search(r'\bwrite\s*\(IMAIN,\*\)\w+', line): + line = re.sub(r'\)(?=\w)', ') ', line, flags=re.IGNORECASE) + + # "write(*,*)'my-comment'" -> "write(*,*)'my-comment'" + if re.search(r'\bwrite\s*\(\*,\*\)\'[^\']*\'', line): + line = re.sub(r'\)(?=\'[^\']*\')', ') ', line, flags=re.IGNORECASE) + + # "write(*,*)my-parameter" -> "write(*,*) my-parameter" + if re.search(r'\bwrite\s*\(\*,\*\)\w+', 
line): + line = re.sub(r'\)(?=\w)', ') ', line, flags=re.IGNORECASE) + + # "if (a==b)something" -> "if (a==b) something" + if re.search(r'\bif\s*\(\s*(\w+)\s*==\s*(\w+)\s*\)\w+', line): + line = re.sub(r'\)(?=\w)', ') ', line, flags=re.IGNORECASE) + + # on non-comment lines + if not first_letter in comment: + # do i=1,.. -> do i = 1,.. + if re.search(r'\bdo\s+(\w+)\s*=(\d+)\s*\,', line): + line = re.sub(r'\bdo\s+(\w+)\s*=(\d+)\s*\,', r'do \1 = \2,', line, flags=re.IGNORECASE) + + # do i=ilat,.. -> do i = ilat,.. + if re.search(r'\bdo\s+(\w+)\s*=(\w+)\s*\,', line): + line = re.sub(r'\bdo\s+(\w+)\s*=(\w+)\s*\,', r'do \1 = \2,', line, flags=re.IGNORECASE) + + # "myvar==0" -> "myvar == 0" + if re.search(r'(\w+)==(\d+)', line): + newline = re.sub(r'(\w+)==(\d+)', r'\1 == \2', line, flags=re.IGNORECASE) + print(" A newline: ",newline) + + # "myvar==something" -> "myvar == something" + if re.search(r'\b(\w+)==(\w+)', line): + newline = re.sub(r'\b(\w+)==(\w+)', r'\1 == \2', line, flags=re.IGNORECASE) + print(" B newline: ",newline) + + # "a=b" -> "a = b" + exclude_equal_patterns = [ + r'==', + r'>=', + r'<=', + r'\bopen\s*\(', + r'\bclose\s*\(', + r'\binquire\s*\(', + r'\bread\s*\(', + r'\bwrite\s*\(', + r'\brandom_seed\s*\(', + r'\bminloc\s*\(', + r'\bminval\s*\(', + r'\bmaxval\s*\(', + r'\bcheck_status\s*\(', + r'\bget_command_argument\s*\(', + r'\bdate_and_time\s*\(', + r'\bexit_mpi\s*\(', + r'\bexit_MPI\s*\(', + r'\blibxsmm', + r'\bprint\s*\*', + r'\brecl=', + r'\bstat=', + r'\bexitstat=', + r'\biostat=', + r'\blen=', + r'\bkind=', + r'\bh5', + r'^[^)]*\)[^)]*$', + r'^\s*&', + ] + has_equal_pattern = False + for pattern in exclude_equal_patterns: + if re.search(pattern, line): + has_equal_pattern = True + break + if not has_equal_pattern: + # "myvar=something" -> "myvar = something" but not "a==b" or lines with "open(unit=.." etc. 
# trailing white space (including carriage returns) up to the end of a line; compiled once
_TRAILING_WHITESPACE = re.compile(r'\s*$')


def format_content_general(content):
    """
    applies cleaning to general (text) files, for example output_solver.txt files in REF_SEIS/ folders

    The only cleaning done is to suppress trailing white spaces (and carriage returns)
    on every line; the number of lines is left unchanged.

    content - full text of the file as a single string

    returns the cleaned text as a single string
    """
    # line-by-line: strip the end of each line, then join the lines back together
    return '\n'.join(_TRAILING_WHITESPACE.sub('', line) for line in content.split('\n'))


def clean_code_format(file):
    """
    cleans code format of a single file

    Files matching one of the `exclude_files` patterns, or having none of the extensions
    listed in `fortran_file_extensions` / `general_file_extensions`, are left untouched.
    Fortran files are formatted by format_content_fortran(), all others by format_content_general().

    Depending on the module-level switches:
      show_diff            - prints every changed line (old and new, with 1-based line numbers)
      replace_file_content - writes the new content back to the file (only if something changed)

    file - path of the file to clean

    returns nothing
    """
    # Exclude specified files
    if any(fnmatch.fnmatch(file, pattern) for pattern in exclude_files):
        return

    # Process files only with specified extensions (Fortran takes precedence over general)
    is_Fortran_file = file.endswith(tuple(fortran_file_extensions))
    is_general_file = not is_Fortran_file and file.endswith(tuple(general_file_extensions))

    # checks if anything to do
    if not (is_Fortran_file or is_general_file):
        return

    print(f'Processing {file}...')

    # Read the file
    # newline='' keeps the line endings as stored on disk; with the default universal newline
    # translation, a file whose only problem is CRLF endings would look unchanged and never be fixed.
    try:
        with open(file, 'r', newline='') as f:
            raw_content = f.read()
    except UnicodeDecodeError as err:
        # a single undecodable file should not abort the cleaning of a whole folder
        print(f"Warning: skipping {file}, content could not be decoded: {err}")
        return

    # formatting works on newline-only line endings
    content = raw_content.replace('\r\n', '\n').replace('\r', '\n')
    has_carriage_returns = (content != raw_content)

    # content
    if is_Fortran_file:
        # fortran code formatting
        content_new = format_content_fortran(content)
    else:
        # general file cleaning
        content_new = format_content_general(content)

    # output
    if show_diff:
        if has_carriage_returns:
            print("  carriage returns found: line endings will be converted to newlines")

        # show differences only line-by-line
        lines_org = content.split('\n')
        lines_new = content_new.split('\n')

        if len(lines_org) != len(lines_new):
            print("Warning: content number of lines differ: original = {} new = {}".format(len(lines_org), len(lines_new)))

        # zip() stops at the shorter listing; line numbers start at 1 as in editors and compilers
        for number, (line_org, line_new) in enumerate(zip(lines_org, lines_new), start=1):
            # show if lines are different
            if line_new != line_org:
                print(f"  line {number}: - {line_org}")
                print(f"  line {number}: + {line_new}")

    # Write the modified content back to the file if anything changed
    # (compared against the raw content, such that carriage-return-only changes get written as well)
    if replace_file_content and content_new != raw_content:
        # newline='\n' avoids re-introducing carriage returns on platforms that translate newlines
        with open(file, 'w', newline='\n') as f:
            f.write(content_new)
def clean_listings(folder_filename):
    """
    loops over (Fortran) code files and updates code formatting

    folder_filename - either a folder, which is walked recursively, or a single file

    Directories listed in `exclude_dirs` are not entered. An entry matches either by
    bare directory name (e.g. '.git') or by path (e.g. './utils/some/dir'); path entries are
    interpreted relative to the current working directory.

    Exits with status 1 if folder_filename is neither an existing folder nor an existing file.
    """
    # determines whether a folder or a specific file was provided as input
    if os.path.isdir(folder_filename):
        # folder
        # absolute paths make the match independent of how the folder was spelled on the
        # command line ('./', 'utils/', an absolute path, ..)
        excluded_paths = {os.path.abspath(entry) for entry in exclude_dirs}

        # Iterate over all files in the source directory and its subdirectories
        for root, dirs, files in os.walk(folder_filename):
            # Exclude specified directories (pruned in place, such that os.walk() does not enter them)
            dirs[:] = [d for d in dirs
                       if d not in exclude_dirs
                       and os.path.abspath(os.path.join(root, d)) not in excluded_paths]

            for file in files:
                # clean code formatting
                clean_code_format(os.path.join(root, file))

    elif os.path.isfile(folder_filename):
        # file
        # clean code formatting
        clean_code_format(folder_filename)

    else:
        # nothing was processed, do not report this as done
        print("Error: {} is neither a file nor a folder".format(folder_filename))
        sys.exit(1)

    print("")
    print("all done")
    print("")


# reads in arguments
def usage():
    """
    prints the command line help and exits with status 1
    """
    print("Usage: ./clean_listings_specfem.py filename/folder [--diff/--no-diff] [--replace]")
    print("")
    print("  filename/folder   - required input file or folder containing Fortran source code files, e.g., src/")
    print("  --diff/--no-diff  - show or don't show formatting differences (default is to show differences)")
    print("  --replace         - replace file content with new formatting (default off)")
    sys.exit(1)
def parse_arguments(argv, show_diff_default=True, replace_default=False):
    """
    interprets the command line arguments

    argv              - full argument list, argv[0] being the script name and argv[1] the file/folder
    show_diff_default - value of the show-diff switch if neither --diff nor --no-diff is given
    replace_default   - value of the replace switch if --replace is not given

    Options are matched exactly and only looked for after the file/folder argument, such that
    neither the script name nor the file/folder path can accidentally switch an option on.

    returns the tuple (folder_filename, show_diff, replace_file_content)
    raises ValueError if the file/folder argument is missing or an option is not recognized
    """
    if len(argv) < 2:
        raise ValueError("missing filename/folder argument")

    target = argv[1]
    # an option given in place of the file/folder (an existing path with such a name is still accepted)
    if target.startswith('--') and not os.path.exists(target):
        raise ValueError("missing filename/folder argument, got option: {}".format(target))

    diff = show_diff_default
    replace = replace_default
    for arg in argv[2:]:
        if arg == "--diff":
            diff = True
        elif arg == "--no-diff":
            diff = False
        elif arg == "--replace":
            replace = True
        else:
            raise ValueError("argument not recognized: {}".format(arg))

    return target, diff, replace


if __name__ == '__main__':

    # gets arguments
    if len(sys.argv) < 2:
        usage()

    # reads arguments (updates the module-level switches used by clean_code_format)
    try:
        folder_filename, show_diff, replace_file_content = parse_arguments(
            sys.argv, show_diff, replace_file_content)
    except ValueError as err:
        print(err)
        print("")
        usage()

    # main routine
    clean_listings(folder_filename)