From da9ed30a53eb1369f5110651082a9fca51a9c0bc Mon Sep 17 00:00:00 2001 From: octrow Date: Thu, 9 May 2024 21:57:20 +0500 Subject: [PATCH] change structure --- gui-customtkinter.py | 12 +- repoharvester.py | 272 +++++++++++++++++++++++-------------------- requirements.txt | 75 +++++++++++- 3 files changed, 226 insertions(+), 133 deletions(-) diff --git a/gui-customtkinter.py b/gui-customtkinter.py index 3a2f2d3..d673df8 100644 --- a/gui-customtkinter.py +++ b/gui-customtkinter.py @@ -1,8 +1,8 @@ import customtkinter as ctk from tkinter import filedialog # For file dialog -from repoharvester import EXTENSION_GROUPS, main # Import from your repoharvester script import threading +from repoharvester import RepoHarvester class RepoHarvesterGUI: @@ -117,13 +117,13 @@ def start_process(self): max_size = self.max_size_entry.get() exclude_folders = self.exclude_folders_entry.get().replace(" ", "").split(",") # ... (Other processing logic) - + harvester = RepoHarvester() # Get included/excluded file types excluded_extensions = set() for group_name, var in self.file_type_vars.items(): if not var.get(): # If checkbox is unchecked - excluded_extensions.update(EXTENSION_GROUPS[group_name]) + excluded_extensions.update(harvester.EXTENSION_GROUPS[group_name]) # Get custom extensions (if entered) custom_exts = self.custom_ext_entry.get().replace(" ", "").split(",") @@ -137,12 +137,12 @@ def start_process(self): # Create and start a thread for the harvesting process thread = threading.Thread(target=self.harvesting_thread, args=(repo_url, remove_comments, excluded_extensions, max_size, - exclude_folders)) + exclude_folders, harvester)) thread.start() - def harvesting_thread(self, repo_url, remove_comments, excluded_extensions, max_size, exclude_folders): + def harvesting_thread(self, repo_url, remove_comments, excluded_extensions, max_size, exclude_folders, harvester): try: - main(repo_url, remove_comments, excluded_extensions, max_size, exclude_folders) + harvester.run_from_gui(repo_url, remove_comments, excluded_extensions, max_size, exclude_folders) self.status_text.insert("end", "Harvesting completed successfully!\n") except Exception as e: self.status_text.insert("end", f"An error occurred: {e}\n") diff --git a/repoharvester.py b/repoharvester.py index 2123405..e6fa795 100644 --- a/repoharvester.py +++ b/repoharvester.py @@ -7,133 +7,153 @@ from comment_pattens import COMMENT_PATTERNS -EXTENSION_GROUPS = { - 'media': {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'svg', 'ico', 'raw', 'psd', 'ai'}, - 'office': {'xlsx', 'xls', 'docx', 'pptx', 'pdf'}, - 'system': {'pack', 'idx', 'DS_Store', 'sys', 'ini', 'bat', 'plist'}, - 'executables': {'exe', 'dll', 'so', 'bin'}, - 'archive': {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'}, - 'audio': {'mp3', 'wav', 'aac', 'flac'}, - 'video': {'mp4', 'avi', 'mov', 'wmv', 'flv'}, - 'database': {'db', 'sqlitedb', 'mdb'}, - 'font': {'ttf', 'otf', 'woff', 'woff2'}, - 'temporary': {'tmp', 'temp', 'swp', 'swo'}, - 'compiled_code': {'o', 'obj', 'pyc', 'class'}, - 'certificate': {'cer', 'pem', 'crt', 'key'}, - 'configuration': {'conf', 'cfg', 'config'}, - 'virtual_env': {'venv', 'env'}, - 'node_modules': {'node_modules'}, - 'python_bytecode': {'pyo'}, - 'package_locks': {'package-lock.json', 'yarn.lock', 'Gemfile.lock'}, - 'log_files': {'err', 'stderr', 'stdout', 'log',}, - 'cache_files': {'cache', 'cached'} -} - - -def get_repo_name(repo_url): - """Extract the repository name from the URL.""" - return repo_url.strip().split('/')[-1].replace('.git', '') - -def clone_repository(repo_url, temp_dir): - """Clone the repository into a temporary directory.""" - subprocess.run(['git', 'clone', repo_url, temp_dir], check=True) - -def get_file_list(temp_dir, excluded_extensions, max_size, excluded_folders): - """Walk the directory tree to get the list of files excluding certain extensions, .git, and .github directories.""" - file_list = [] - for root, dirs, files in os.walk(temp_dir, topdown=True): - dirs[:] = [d for d in dirs if d not in {'.git', '.github'} and d not in excluded_folders] # Skip the .git and .github directories - for file in files: - if file.split('.')[-1] not in excluded_extensions: - file_path = os.path.join(root, file) - file_size_kb = os.path.getsize(file_path) / 1024 - if file_size_kb > max_size: - print(f"Skipping file larger than {max_size} KB: {file}, size: {file_size_kb} KB") - continue - elif file_size_kb > 500: - print(f"File larger than 500 KB: {file}, size: {file_size_kb} KB") - file_list.append(os.path.join(root, file)) - return file_list - -def remove_comments(content, file_extension): - """Remove comments from the content based on the file extension.""" - pattern = COMMENT_PATTERNS.get(file_extension) - if pattern: - content = re.sub(pattern, '', content, flags=re.MULTILINE) - return content - -def write_to_union_file(file_list, repo_name, remove_comments_flag, log_file): - output_dir = 'output' - skipped_files = f'{output_dir}/skipped_files.txt' - os.makedirs(output_dir, exist_ok=True) - union_filename = f'{output_dir}/{repo_name}_all_files.txt' - - with open(union_filename, 'w', encoding='utf-8') as union_file, \ - open(skipped_files, 'w', encoding='utf-8') as skipped_file: - - union_file.write(f'## {repo_name}\n') - - for file_path in file_list: - filename = os.path.basename(file_path) - file_extension = filename.split('.')[-1] - file_size = os.path.getsize(file_path) / 1024 # Calculate file size in KB +class RepoHarvester: + def __init__(self): + self.EXTENSION_GROUPS = { + 'media': {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'svg', 'ico', 'raw', 'psd', 'ai'}, + 'office': {'xlsx', 'xls', 'docx', 'pptx', 'pdf'}, + 'system': {'pack', 'idx', 'DS_Store', 'sys', 'ini', 'bat', 'plist'}, + 'executables': {'exe', 'dll', 'so', 'bin'}, + 'archive': {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'}, + 'audio': {'mp3', 'wav', 'aac', 'flac'}, + 'video': {'mp4', 'avi', 'mov', 'wmv', 'flv'}, + 'database': {'db', 'sqlitedb', 'mdb'}, + 'font': {'ttf', 'otf', 'woff', 'woff2'}, + 'temporary': {'tmp', 'temp', 'swp', 'swo'}, + 'compiled_code': {'o', 'obj', 'pyc', 'class'}, + 'certificate': {'cer', 'pem', 'crt', 'key'}, + 'configuration': {'conf', 'cfg', 'config'}, + 'virtual_env': {'venv', 'env'}, + 'node_modules': {'node_modules'}, + 'python_bytecode': {'pyo'}, + 'package_locks': {'package-lock.json', 'yarn.lock', 'Gemfile.lock'}, + 'log_files': {'err', 'stderr', 'stdout', 'log',}, + 'cache_files': {'cache', 'cached'} + } + + def _get_repo_name(self, repo_url): + """Extract the repository name from the URL.""" + return repo_url.strip().split('/')[-1].replace('.git', '') + + + def _clone_repository(self, repo_url, temp_dir): + """Clone the repository into a temporary directory.""" + subprocess.run(['git', 'clone', repo_url, temp_dir], check=True) + + def _get_file_list(self, temp_dir, excluded_extensions, max_size, excluded_folders): + """Walk the directory tree to get the list of files excluding certain extensions, .git, and .github directories.""" + file_list = [] + for root, dirs, files in os.walk(temp_dir, topdown=True): + dirs[:] = [d for d in dirs if d not in {'.git', '.github'} and d not in excluded_folders] # Skip the .git and .github directories + for file in files: + if file.split('.')[-1] not in excluded_extensions: + file_path = os.path.join(root, file) + file_size_kb = os.path.getsize(file_path) / 1024 + if file_size_kb > max_size: + print(f"Skipping file larger than {max_size} KB: {file}, size: {file_size_kb} KB") + continue + elif file_size_kb > 500: + print(f"File larger than 500 KB: {file}, size: {file_size_kb} KB") + file_list.append(os.path.join(root, file)) + return file_list + + def _remove_comments(content, file_extension): + """Remove comments from the content based on the file extension.""" + pattern = COMMENT_PATTERNS.get(file_extension) + if pattern: + content = re.sub(pattern, '', content, flags=re.MULTILINE) + return content + + def _write_to_union_file(self, file_list, repo_name, remove_comments_flag, log_file): + output_dir = 'output' + skipped_files = f'{output_dir}/skipped_files.txt' + os.makedirs(output_dir, exist_ok=True) + union_filename = f'{output_dir}/{repo_name}_all_files.txt' + + with open(union_filename, 'w', encoding='utf-8') as union_file, \ + open(skipped_files, 'w', encoding='utf-8') as skipped_file: + + union_file.write(f'## {repo_name}\n') + + for file_path in file_list: + filename = os.path.basename(file_path) + file_extension = filename.split('.')[-1] + file_size = os.path.getsize(file_path) / 1024 # Calculate file size in KB + + try: + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + + if remove_comments_flag: + content = self._remove_comments(content, file_extension) + + union_file.write(f'### {filename}\n') + union_file.write(content) + union_file.write('\n### end of file\n') + + logging.info(f"{filename}, size: {file_size:.2f} KB") + except UnicodeDecodeError: + print(f"Skipping non-UTF-8 file: {filename}") # Log skipped file + skipped_file.write(f"{filename}\n") # Write skipped file name to file + + return union_filename + + def run_from_command_line(self): + parser = argparse.ArgumentParser(description='Clone a repo and compile its contents into a single file.') + parser.add_argument('repo_url', type=str, help='GitHub repository URL (SSH)') + parser.add_argument('-r', '--remove', action='store_true', help='Remove comments from code files') + parser.add_argument('--no-skip', nargs='+', help='Do not skip files of these types') + parser.add_argument('--max-size', type=int, default=1000, help='Maximum file size in KB') + parser.add_argument('--log', type=str, default='output/union_file.log', help='Path to log file') + parser.add_argument('--exclude', nargs='+', default=[], help='Exclude these folders (and their contents)') + args = parser.parse_args() + + # Configure logging + logging.basicConfig(filename=args.log, level=logging.INFO, + format='%(message)s') + + # Start by excluding all extensions + excluded_extensions = set() + for extensions in self.EXTENSION_GROUPS.values(): + excluded_extensions.update(extensions) + + # Remove excluded groups if specified in --no-skip + if args.no_skip: + for group in args.no_skip: + if group in self.EXTENSION_GROUPS: + excluded_extensions -= self.EXTENSION_GROUPS[group] + + repo_name = self._get_repo_name(args.repo_url) + temp_dir = f'tmp_{repo_name}' + try: + self._clone_repository(args.repo_url, temp_dir) + file_list = self._get_file_list(temp_dir, excluded_extensions, args.max_size, args.exclude) + union_filename = self._write_to_union_file(file_list, repo_name, args.remove, args.log) + print(f'All files have been written to {union_filename}') + finally: + try: + shutil.rmtree(temp_dir) + except OSError as e: + print(f'Error: {e.strerror} - {e.filename}') + + def run_from_gui(self, repo_url, remove_comments, excluded_extensions, max_size, exclude_folders, log_file_path='output/union_file.log'): + # Configure logging + logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(message)s') + repo_name = self._get_repo_name(repo_url) + temp_dir = f'tmp_{repo_name}' try: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - - if remove_comments_flag: - content = remove_comments(content, file_extension) - - union_file.write(f'### {filename}\n') - union_file.write(content) - union_file.write('\n### end of file\n') - - logging.info(f"{filename}, size: {file_size:.2f} KB") - except UnicodeDecodeError: - print(f"Skipping non-UTF-8 file: {filename}") # Log skipped file - skipped_file.write(f"{filename}\n") # Write skipped file name to file - - return union_filename - -def main(repo_url, remove_comments, excluded_extensions, max_size, exclude_folders): - """Main function to execute the script.""" - parser = argparse.ArgumentParser(description='Clone a repo and compile its contents into a single file.') - parser.add_argument('repo_url', type=str, help='GitHub repository URL (SSH)') - parser.add_argument('-r', '--remove', action='store_true', help='Remove comments from code files') - parser.add_argument('--no-skip', nargs='+', help='Do not skip files of these types') - parser.add_argument('--max-size', type=int, default=1000, help='Maximum file size in KB') - parser.add_argument('--log', type=str, default='output/union_file.log', help='Path to log file') - parser.add_argument('--exclude', nargs='+', default=[], help='Exclude these folders (and their contents)') - args = parser.parse_args() - - # Configure logging - logging.basicConfig(filename=args.log, level=logging.INFO, - format='%(message)s') - - # Start by excluding all extensions - excluded_extensions = set() - for extensions in EXTENSION_GROUPS.values(): - excluded_extensions.update(extensions) - - # Remove excluded groups if specified in --no-skip - if args.no_skip: - for group in args.no_skip: - if group in EXTENSION_GROUPS: - excluded_extensions -= EXTENSION_GROUPS[group] - - repo_name = get_repo_name(args.repo_url) - temp_dir = f'tmp_{repo_name}' - try: - clone_repository(args.repo_url, temp_dir) - file_list = get_file_list(temp_dir, excluded_extensions, args.max_size, args.exclude) - union_filename = write_to_union_file(file_list, repo_name, args.remove, args.log) - print(f'All files have been written to {union_filename}') - finally: - try: - shutil.rmtree(temp_dir) - except OSError as e: - print(f'Error: {e.strerror} - {e.filename}') + self._clone_repository(repo_url, temp_dir) + file_list = self._get_file_list(temp_dir, excluded_extensions, max_size, exclude_folders) + union_filename = self._write_to_union_file(file_list, repo_name, remove_comments, log_file_path) + print(f'All files have been written to {union_filename}') + finally: + try: + shutil.rmtree(temp_dir) + except OSError as e: + print(f'Error: {e.strerror} - {e.filename}') + if __name__ == '__main__': - main() + harvester = RepoHarvester() + harvester.run_from_command_line() diff --git a/requirements.txt b/requirements.txt index 3080767..037d872 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,85 @@ +annotated-types==0.6.0 +anyio==4.3.0 +beautifulsoup4==4.12.3 +blinker==1.4 +bs4==0.0.2 +build==1.1.1 +CacheControl==0.14.0 certifi==2024.2.2 +cfgv==3.4.0 charset-normalizer==3.3.2 +cleo==2.1.0 +crashtest==0.4.1 +cryptography==3.4.8 +customtkinter==5.2.2 +darkdetect==0.8.0 +dbus-python==1.2.18 +distlib==0.3.8 +distro==1.7.0 +distro-info==1.1+ubuntu0.2 docutils==0.21.2 -idna==3.7 +dulwich==0.21.7 +fastjsonschema==2.19.1 +filelock==3.13.1 +gitdb==4.0.11 +GitPython==3.1.42 +h11==0.14.0 +httpcore==1.0.5 +httplib2==0.20.2 +httpx==0.27.0 +identify==2.5.35 +idna==3.6 +importlib_metadata==7.0.2 +installer==0.7.0 +jaraco.classes==3.3.1 +jeepney==0.7.1 +keyring==24.3.1 Kivy==2.3.0 Kivy-Garden==0.1.5 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +msgpack==1.0.8 +nodeenv==1.8.0 +oauthlib==3.2.0 +packaging==23.2 +pexpect==4.9.0 +pkginfo==1.10.0 +platformdirs==4.2.0 +poetry==1.8.2 +poetry-core==1.9.0 +poetry-plugin-export==1.6.0 +pre-commit==3.6.2 +ptyprocess==0.7.0 pyasn1==0.6.0 +pydantic==2.7.1 +pydantic_core==2.18.2 Pygments==2.17.2 +PyGObject==3.42.1 +PyJWT==2.3.0 +pyparsing==2.4.7 +pyproject_hooks==1.0.0 PySimpleGUI==5.0.4 +python-apt==2.4.0+ubuntu3 +PyYAML==6.0.1 +rapidfuzz==3.6.2 +replicate==0.25.2 requests==2.31.0 +requests-toolbelt==1.0.0 rsa==4.9 +SecretStorage==3.3.1 +shellingham==1.5.4 +six==1.16.0 +smmap==5.0.1 +sniffio==1.3.1 +soupsieve==2.5 +tomlkit==0.12.4 +trove-classifiers==2024.3.3 +typing_extensions==4.11.0 +ufw==0.36.1 +unattended-upgrades==0.1 urllib3==2.2.1 +virtualenv==20.25.1 +wadllib==1.3.6 +zipp==1.0.0