-
Notifications
You must be signed in to change notification settings - Fork 1
/
markdown_code_parser.py
110 lines (88 loc) · 3.58 KB
/
markdown_code_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import re
import os
import sys
# Fenced code block with an optional language tag; group 1 is the block body.
_CODE_BLOCK_RE = re.compile(r'```(?:\w+)?\n(.*?)```', re.DOTALL)

# Upper bound, in UTF-8 bytes, for a file name derived from a markdown line.
# Filesystems commonly reject names longer than 255 bytes; the margin leaves
# room for the "_N" uniqueness suffix and the default ".txt" extension.
_MAX_FILENAME_BYTES = 200


def _last_nonblank_line(text, end):
    """Return the stripped last non-blank line of ``text[:end]`` ('' if none)."""
    # Walk backwards one line at a time instead of splitting the whole prefix,
    # so the cost depends on the distance to the title, not the document size.
    while end > 0:
        line_start = text.rfind('\n', 0, end) + 1
        line = text[line_start:end].strip()
        if line:
            return line
        end = line_start - 1  # step over the newline that precedes this line
    return ''


def _sanitize_filename(line):
    """Turn a markdown line into a file name; fall back to 'code' if empty."""
    # Remove leading numbering like "1. " or "1._"
    name = re.sub(r'^[\d]+\.\s*_?', '', line)
    # Keep only word characters, dashes, dots and spaces
    name = re.sub(r'[^\w\-\. ]', '', name)
    name = name.strip().replace(' ', '_')
    return name if name else 'code'


def _truncate_utf8(text, max_bytes):
    """Shorten ``text`` so that its UTF-8 encoding fits in ``max_bytes``."""
    encoded = text.encode('utf-8')
    if len(encoded) <= max_bytes:
        return text
    # errors='ignore' drops a multi-byte character cut in half by the slice.
    return encoded[:max_bytes].decode('utf-8', errors='ignore')


def _clean_code(raw):
    """Drop leading blank lines and trailing whitespace from a block body.

    The indentation of the first code line is kept, because removing it would
    break the relative indentation of the snippet.
    """
    return re.sub(r'\A(?:[^\S\n]*\n)+', '', raw).rstrip()


def parse_markdown_code(input_file: str, output_dir: str) -> list:
    """
    Parse a markdown file and extract code blocks with their preceding filenames.

    Each fenced code block is written to its own file in ``output_dir``. The
    file name comes from the last non-blank line before the block: leading
    list numbering is removed, unsafe characters are dropped and spaces become
    underscores. Names without an extension get ``.txt``; over-long names are
    truncated; name clashes with existing files get a ``_1``, ``_2``, ...
    suffix.

    Args:
        input_file (str): Path to the input markdown file
        output_dir (str): Directory to save the extracted code files
            (created if it does not exist)

    Returns:
        list: List of generated file paths, in document order

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Read the entire markdown file
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    generated_files = []
    for match in _CODE_BLOCK_RE.finditer(content):
        # Use the last non-empty line before the code block as the filename
        title = _last_nonblank_line(content, match.start())
        filename = _sanitize_filename(title) if title else 'code.txt'
        # A whole sentence may precede the block; keep the name within what
        # the filesystem accepts instead of failing the entire extraction.
        filename = _truncate_utf8(filename, _MAX_FILENAME_BYTES)

        base, ext = os.path.splitext(filename)
        ext = ext if ext else '.txt'

        # Ensure unique filename
        full_filename = os.path.join(output_dir, f"{base}{ext}")
        counter = 1
        while os.path.exists(full_filename):
            full_filename = os.path.join(output_dir, f"{base}_{counter}{ext}")
            counter += 1

        # Write the code block to the file
        with open(full_filename, 'w', encoding='utf-8') as out:
            out.write(_clean_code(match.group(1)))
        generated_files.append(full_filename)

    return generated_files
def main():
    """Command-line entry point: extract code blocks from a markdown file.

    Reads the markdown path from ``sys.argv[1]`` and writes the extracted
    files to the ``extracted_code`` directory, then prints each generated
    path relative to that directory.

    Exits with status 1 (message on stderr) when the argument is missing,
    the input file does not exist, or extraction fails for any other reason,
    so that shell callers can detect the failure.
    """
    # Check if input file is provided
    if len(sys.argv) < 2:
        print("Usage: python markdown_code_parser.py <input_markdown_file>",
              file=sys.stderr)
        sys.exit(1)

    input_markdown = sys.argv[1]
    output_directory = 'extracted_code'

    # Only the extraction itself is guarded; reporting happens afterwards.
    try:
        generated_files = parse_markdown_code(input_markdown, output_directory)
    except FileNotFoundError:
        print(f"Error: Input file {input_markdown} not found.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"An error occurred: {e}", file=sys.stderr)
        sys.exit(1)

    print("Generated files:")
    for path in generated_files:
        print(os.path.relpath(path, output_directory))


if __name__ == '__main__':
    main()
# ------ example run (command, then its output) ------
# python markdown_code_parser.py ~/Desktop/ua_list_vscode.md
# Generated files:
# Ill_help_you_create_a_VSCode_extension_for_an_AI_chat_tool_that_can_maintain_conversation_history._Ill_break_this_down_into_several_files_and_use_TypeScript_with_the_VSCode_extension_API.
# package.json
# constants.ts
# chatProvider.ts
# chatView.ts
# extension.ts
# tsconfig.json
#