Commit

change structure
octrow committed May 9, 2024
1 parent 68e1b6d commit da9ed30
Showing 3 changed files with 226 additions and 133 deletions.
12 changes: 6 additions & 6 deletions gui-customtkinter.py
@@ -1,8 +1,8 @@
import customtkinter as ctk
from tkinter import filedialog # For file dialog
from repoharvester import EXTENSION_GROUPS, main # Import from your repoharvester script
import threading

from repoharvester import RepoHarvester


class RepoHarvesterGUI:
@@ -117,13 +117,13 @@ def start_process(self):
max_size = int(self.max_size_entry.get())  # Entry widgets return strings; convert so size comparisons work
exclude_folders = self.exclude_folders_entry.get().replace(" ", "").split(",")
# ... (Other processing logic)

harvester = RepoHarvester()

# Get included/excluded file types
excluded_extensions = set()
for group_name, var in self.file_type_vars.items():
if not var.get(): # If checkbox is unchecked
excluded_extensions.update(EXTENSION_GROUPS[group_name])
excluded_extensions.update(harvester.EXTENSION_GROUPS[group_name])

# Get custom extensions (if entered)
custom_exts = self.custom_ext_entry.get().replace(" ", "").split(",")
@@ -137,12 +137,12 @@ def start_process(self):
# Create and start a thread for the harvesting process
thread = threading.Thread(target=self.harvesting_thread, args=(repo_url, remove_comments,
excluded_extensions, max_size,
exclude_folders))
exclude_folders, harvester))
thread.start()

def harvesting_thread(self, repo_url, remove_comments, excluded_extensions, max_size, exclude_folders):
def harvesting_thread(self, repo_url, remove_comments, excluded_extensions, max_size, exclude_folders, harvester):
try:
main(repo_url, remove_comments, excluded_extensions, max_size, exclude_folders)
harvester.run_from_gui(repo_url, remove_comments, excluded_extensions, max_size, exclude_folders)
self.status_text.insert("end", "Harvesting completed successfully!\n")
except Exception as e:
self.status_text.insert("end", f"An error occurred: {e}\n")
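
With this change, the GUI no longer calls the module-level main(); it instantiates RepoHarvester and hands the instance to the worker thread. A minimal sketch of the new call pattern outside the GUI, assuming repoharvester.py is importable (the repository URL and group names below are placeholders, not part of this commit):

import threading
from repoharvester import RepoHarvester

harvester = RepoHarvester()

# Build the exclusion set from the instance's EXTENSION_GROUPS,
# mirroring what start_process() does for unchecked checkboxes.
excluded_extensions = set()
for group in ('media', 'archive'):  # placeholder group names
    excluded_extensions.update(harvester.EXTENSION_GROUPS[group])

# run_from_gui(repo_url, remove_comments, excluded_extensions, max_size, exclude_folders)
thread = threading.Thread(
    target=harvester.run_from_gui,
    args=(
        'git@github.com:user/example.git',  # placeholder URL
        True,                  # remove_comments
        excluded_extensions,   # excluded_extensions
        1000,                  # max_size in KB
        [],                    # exclude_folders
    ),
)
thread.start()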
272 changes: 146 additions & 126 deletions repoharvester.py
@@ -7,133 +7,153 @@

from comment_pattens import COMMENT_PATTERNS

EXTENSION_GROUPS = {
'media': {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'svg', 'ico', 'raw', 'psd', 'ai'},
'office': {'xlsx', 'xls', 'docx', 'pptx', 'pdf'},
'system': {'pack', 'idx', 'DS_Store', 'sys', 'ini', 'bat', 'plist'},
'executables': {'exe', 'dll', 'so', 'bin'},
'archive': {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'},
'audio': {'mp3', 'wav', 'aac', 'flac'},
'video': {'mp4', 'avi', 'mov', 'wmv', 'flv'},
'database': {'db', 'sqlitedb', 'mdb'},
'font': {'ttf', 'otf', 'woff', 'woff2'},
'temporary': {'tmp', 'temp', 'swp', 'swo'},
'compiled_code': {'o', 'obj', 'pyc', 'class'},
'certificate': {'cer', 'pem', 'crt', 'key'},
'configuration': {'conf', 'cfg', 'config'},
'virtual_env': {'venv', 'env'},
'node_modules': {'node_modules'},
'python_bytecode': {'pyo'},
'package_locks': {'package-lock.json', 'yarn.lock', 'Gemfile.lock'},
'log_files': {'err', 'stderr', 'stdout', 'log',},
'cache_files': {'cache', 'cached'}
}


def get_repo_name(repo_url):
"""Extract the repository name from the URL."""
return repo_url.strip().split('/')[-1].replace('.git', '')

def clone_repository(repo_url, temp_dir):
"""Clone the repository into a temporary directory."""
subprocess.run(['git', 'clone', repo_url, temp_dir], check=True)

def get_file_list(temp_dir, excluded_extensions, max_size, excluded_folders):
"""Walk the directory tree to get the list of files excluding certain extensions, .git, and .github directories."""
file_list = []
for root, dirs, files in os.walk(temp_dir, topdown=True):
dirs[:] = [d for d in dirs if d not in {'.git', '.github'} and d not in excluded_folders] # Skip the .git and .github directories
for file in files:
if file.split('.')[-1] not in excluded_extensions:
file_path = os.path.join(root, file)
file_size_kb = os.path.getsize(file_path) / 1024
if file_size_kb > max_size:
print(f"Skipping file larger than {max_size} KB: {file}, size: {file_size_kb} KB")
continue
elif file_size_kb > 500:
print(f"File larger than 500 KB: {file}, size: {file_size_kb} KB")
file_list.append(os.path.join(root, file))
return file_list

def remove_comments(content, file_extension):
"""Remove comments from the content based on the file extension."""
pattern = COMMENT_PATTERNS.get(file_extension)
if pattern:
content = re.sub(pattern, '', content, flags=re.MULTILINE)
return content

def write_to_union_file(file_list, repo_name, remove_comments_flag, log_file):
output_dir = 'output'
skipped_files = f'{output_dir}/skipped_files.txt'
os.makedirs(output_dir, exist_ok=True)
union_filename = f'{output_dir}/{repo_name}_all_files.txt'

with open(union_filename, 'w', encoding='utf-8') as union_file, \
open(skipped_files, 'w', encoding='utf-8') as skipped_file:

union_file.write(f'## {repo_name}\n')

for file_path in file_list:
filename = os.path.basename(file_path)
file_extension = filename.split('.')[-1]
file_size = os.path.getsize(file_path) / 1024 # Calculate file size in KB
class RepoHarvester:
def __init__(self):
self.EXTENSION_GROUPS = {
'media': {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'svg', 'ico', 'raw', 'psd', 'ai'},
'office': {'xlsx', 'xls', 'docx', 'pptx', 'pdf'},
'system': {'pack', 'idx', 'DS_Store', 'sys', 'ini', 'bat', 'plist'},
'executables': {'exe', 'dll', 'so', 'bin'},
'archive': {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'},
'audio': {'mp3', 'wav', 'aac', 'flac'},
'video': {'mp4', 'avi', 'mov', 'wmv', 'flv'},
'database': {'db', 'sqlitedb', 'mdb'},
'font': {'ttf', 'otf', 'woff', 'woff2'},
'temporary': {'tmp', 'temp', 'swp', 'swo'},
'compiled_code': {'o', 'obj', 'pyc', 'class'},
'certificate': {'cer', 'pem', 'crt', 'key'},
'configuration': {'conf', 'cfg', 'config'},
'virtual_env': {'venv', 'env'},
'node_modules': {'node_modules'},
'python_bytecode': {'pyo'},
'package_locks': {'package-lock.json', 'yarn.lock', 'Gemfile.lock'},
'log_files': {'err', 'stderr', 'stdout', 'log',},
'cache_files': {'cache', 'cached'}
}

def _get_repo_name(self, repo_url):
"""Extract the repository name from the URL."""
return repo_url.strip().split('/')[-1].replace('.git', '')


def _clone_repository(self, repo_url, temp_dir):
"""Clone the repository into a temporary directory."""
subprocess.run(['git', 'clone', repo_url, temp_dir], check=True)

def _get_file_list(self, temp_dir, excluded_extensions, max_size, excluded_folders):
"""Walk the directory tree to get the list of files excluding certain extensions, .git, and .github directories."""
file_list = []
for root, dirs, files in os.walk(temp_dir, topdown=True):
dirs[:] = [d for d in dirs if d not in {'.git', '.github'} and d not in excluded_folders] # Skip the .git and .github directories
for file in files:
if file.split('.')[-1] not in excluded_extensions:
file_path = os.path.join(root, file)
file_size_kb = os.path.getsize(file_path) / 1024
if file_size_kb > max_size:
print(f"Skipping file larger than {max_size} KB: {file}, size: {file_size_kb} KB")
continue
elif file_size_kb > 500:
print(f"File larger than 500 KB: {file}, size: {file_size_kb} KB")
file_list.append(os.path.join(root, file))
return file_list

def _remove_comments(self, content, file_extension):
"""Remove comments from the content based on the file extension."""
pattern = COMMENT_PATTERNS.get(file_extension)
if pattern:
content = re.sub(pattern, '', content, flags=re.MULTILINE)
return content

def _write_to_union_file(self, file_list, repo_name, remove_comments_flag, log_file):
output_dir = 'output'
skipped_files = f'{output_dir}/skipped_files.txt'
os.makedirs(output_dir, exist_ok=True)
union_filename = f'{output_dir}/{repo_name}_all_files.txt'

with open(union_filename, 'w', encoding='utf-8') as union_file, \
open(skipped_files, 'w', encoding='utf-8') as skipped_file:

union_file.write(f'## {repo_name}\n')

for file_path in file_list:
filename = os.path.basename(file_path)
file_extension = filename.split('.')[-1]
file_size = os.path.getsize(file_path) / 1024 # Calculate file size in KB

try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()

if remove_comments_flag:
content = self._remove_comments(content, file_extension)

union_file.write(f'### {filename}\n')
union_file.write(content)
union_file.write('\n### end of file\n')

logging.info(f"{filename}, size: {file_size:.2f} KB")
except UnicodeDecodeError:
print(f"Skipping non-UTF-8 file: {filename}") # Log skipped file
skipped_file.write(f"{filename}\n") # Write skipped file name to file

return union_filename

def run_from_command_line(self):
parser = argparse.ArgumentParser(description='Clone a repo and compile its contents into a single file.')
parser.add_argument('repo_url', type=str, help='GitHub repository URL (SSH)')
parser.add_argument('-r', '--remove', action='store_true', help='Remove comments from code files')
parser.add_argument('--no-skip', nargs='+', help='Do not skip files of these types')
parser.add_argument('--max-size', type=int, default=1000, help='Maximum file size in KB')
parser.add_argument('--log', type=str, default='output/union_file.log', help='Path to log file')
parser.add_argument('--exclude', nargs='+', default=[], help='Exclude these folders (and their contents)')
args = parser.parse_args()

# Configure logging
logging.basicConfig(filename=args.log, level=logging.INFO,
format='%(message)s')

# Start by excluding all extensions
excluded_extensions = set()
for extensions in self.EXTENSION_GROUPS.values():
excluded_extensions.update(extensions)

# Remove excluded groups if specified in --no-skip
if args.no_skip:
for group in args.no_skip:
if group in self.EXTENSION_GROUPS:
excluded_extensions -= self.EXTENSION_GROUPS[group]

repo_name = self._get_repo_name(args.repo_url)
temp_dir = f'tmp_{repo_name}'
try:
self._clone_repository(args.repo_url, temp_dir)
file_list = self._get_file_list(temp_dir, excluded_extensions, args.max_size, args.exclude)
union_filename = self._write_to_union_file(file_list, repo_name, args.remove, args.log)
print(f'All files have been written to {union_filename}')
finally:
try:
shutil.rmtree(temp_dir)
except OSError as e:
print(f'Error: {e.strerror} - {e.filename}')

def run_from_gui(self, repo_url, remove_comments, excluded_extensions, max_size, exclude_folders, log_file_path='output/union_file.log'):
# Configure logging
logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(message)s')

repo_name = self._get_repo_name(repo_url)
temp_dir = f'tmp_{repo_name}'
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()

if remove_comments_flag:
content = remove_comments(content, file_extension)

union_file.write(f'### {filename}\n')
union_file.write(content)
union_file.write('\n### end of file\n')

logging.info(f"{filename}, size: {file_size:.2f} KB")
except UnicodeDecodeError:
print(f"Skipping non-UTF-8 file: {filename}") # Log skipped file
skipped_file.write(f"{filename}\n") # Write skipped file name to file

return union_filename

def main(repo_url, remove_comments, excluded_extensions, max_size, exclude_folders):
"""Main function to execute the script."""
parser = argparse.ArgumentParser(description='Clone a repo and compile its contents into a single file.')
parser.add_argument('repo_url', type=str, help='GitHub repository URL (SSH)')
parser.add_argument('-r', '--remove', action='store_true', help='Remove comments from code files')
parser.add_argument('--no-skip', nargs='+', help='Do not skip files of these types')
parser.add_argument('--max-size', type=int, default=1000, help='Maximum file size in KB')
parser.add_argument('--log', type=str, default='output/union_file.log', help='Path to log file')
parser.add_argument('--exclude', nargs='+', default=[], help='Exclude these folders (and their contents)')
args = parser.parse_args()

# Configure logging
logging.basicConfig(filename=args.log, level=logging.INFO,
format='%(message)s')

# Start by excluding all extensions
excluded_extensions = set()
for extensions in EXTENSION_GROUPS.values():
excluded_extensions.update(extensions)

# Remove excluded groups if specified in --no-skip
if args.no_skip:
for group in args.no_skip:
if group in EXTENSION_GROUPS:
excluded_extensions -= EXTENSION_GROUPS[group]

repo_name = get_repo_name(args.repo_url)
temp_dir = f'tmp_{repo_name}'
try:
clone_repository(args.repo_url, temp_dir)
file_list = get_file_list(temp_dir, excluded_extensions, args.max_size, args.exclude)
union_filename = write_to_union_file(file_list, repo_name, args.remove, args.log)
print(f'All files have been written to {union_filename}')
finally:
try:
shutil.rmtree(temp_dir)
except OSError as e:
print(f'Error: {e.strerror} - {e.filename}')
self._clone_repository(repo_url, temp_dir)
file_list = self._get_file_list(temp_dir, excluded_extensions, max_size, exclude_folders)
union_filename = self._write_to_union_file(file_list, repo_name, remove_comments, log_file_path)
print(f'All files have been written to {union_filename}')
finally:
try:
shutil.rmtree(temp_dir)
except OSError as e:
print(f'Error: {e.strerror} - {e.filename}')


if __name__ == '__main__':
main()
harvester = RepoHarvester()
harvester.run_from_command_line()
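
On the command-line side, the old main() becomes run_from_command_line(). A standalone sketch of the --no-skip logic it implements (the group name 'media' is an example, not part of the commit):

from repoharvester import RepoHarvester

harvester = RepoHarvester()

# run_from_command_line() starts by excluding every extension in every group...
excluded_extensions = set()
for extensions in harvester.EXTENSION_GROUPS.values():
    excluded_extensions.update(extensions)

# ...then --no-skip subtracts whole groups from the exclusion set.
for group in ['media']:  # as if invoked with: --no-skip media
    if group in harvester.EXTENSION_GROUPS:
        excluded_extensions -= harvester.EXTENSION_GROUPS[group]

print('png' in excluded_extensions)  # False: media files are no longer skipped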