#!/usr/bin/env python
"""
Generate ethical reasoning training data from CommonCrawl samples.

This script analyzes downloaded CommonCrawl data to identify content with
potential ethical implications, then generates training examples with
detailed ethical reasoning.

Usage:
    python generate_ethical_data.py [--input INPUT_DIR] [--output OUTPUT_FILE] [--count COUNT] [--model MODEL]
"""
import os
import json
import glob
import argparse
import random
import re
import requests
from tqdm import tqdm
# Constants
DEFAULT_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "jsonl")
DEFAULT_OUTPUT_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "ethical_training.jsonl")
DEFAULT_COUNT = 20
OLLAMA_API = "http://localhost:11434/api/generate"
DEFAULT_MODEL = "deepseek-r1"
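# Note: generation assumes an Ollama server is running locally on its default port (11434)
# and that the chosen model (e.g. "deepseek-r1") has already been pulled.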
# Ethical categories and keywords
ETHICAL_CATEGORIES = {
    "fairness": [
        "discrimination", "bias", "prejudice", "equality", "unfair", "unjust", "privilege",
        "favoritism", "equitable", "impartial"
    ],
    "harm": [
        "hurt", "damage", "injury", "pain", "suffering", "abuse", "mistreat", "exploit",
        "harm", "unsafe", "danger", "violent"
    ],
    "autonomy": [
        "freedom", "choice", "consent", "coercion", "manipulation", "force", "control",
        "privacy", "liberty", "self-determination", "agency"
    ],
    "deception": [
        "lie", "deceive", "mislead", "trick", "fraud", "false", "fake", "dishonest",
        "cheat", "scam", "misinformation", "disinformation"
    ],
    "justice": [
        "fair", "rights", "deserve", "punishment", "reward", "compensation", "retribution",
        "law", "legal", "illegal", "crime", "equitable"
    ]
}
# Combined keywords for initial filtering
ALL_ETHICAL_KEYWORDS = []
for keywords in ETHICAL_CATEGORIES.values():
    ALL_ETHICAL_KEYWORDS.extend(keywords)

# Remove duplicates
ALL_ETHICAL_KEYWORDS = list(set(ALL_ETHICAL_KEYWORDS))
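# The flat keyword list above is only used for the cheap first-pass filter in
# extract_ethical_content(); the per-category lists are reused later to score
# and label each passage in categorize_ethical_content().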
def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description="Generate ethical reasoning training data")
    parser.add_argument("--input", type=str, default=DEFAULT_DATA_DIR,
                        help=f"Directory containing JSONL files (default: {DEFAULT_DATA_DIR})")
    parser.add_argument("--output", type=str, default=DEFAULT_OUTPUT_FILE,
                        help=f"Output file for training data (default: {DEFAULT_OUTPUT_FILE})")
    parser.add_argument("--count", type=int, default=DEFAULT_COUNT,
                        help=f"Number of examples to generate (default: {DEFAULT_COUNT})")
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL,
                        help=f"Ollama model to use (default: {DEFAULT_MODEL})")
    return parser.parse_args()
def ensure_dir(file_path):
    """Create the parent directory for a file if it doesn't exist"""
    directory = os.path.dirname(file_path)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
def load_cc_data(input_dir):
    """Load CommonCrawl data from JSONL files"""
    all_data = []
    jsonl_files = glob.glob(os.path.join(input_dir, "*.jsonl"))

    if not jsonl_files:
        print(f"No JSONL files found in {input_dir}")
        return []

    for file_path in jsonl_files:
        print(f"Loading data from {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    record = json.loads(line.strip())
                    all_data.append(record)
                except json.JSONDecodeError:
                    continue

    print(f"Loaded {len(all_data)} documents from {len(jsonl_files)} files")
    return all_data
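# Each loaded record is expected to provide a 'text' field; 'url' and 'domain' are
# carried through as optional provenance metadata.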
def extract_ethical_content(documents, keywords=ALL_ETHICAL_KEYWORDS, max_length=1000):
    """Extract passages that contain ethical keywords"""
    ethical_passages = []

    for doc in tqdm(documents, desc="Scanning for ethical content"):
        text = doc.get('text', '')

        # Skip very short texts
        if len(text) < 100:
            continue

        # Check for ethical keywords
        matches = []
        for keyword in keywords:
            pattern = r'\b' + re.escape(keyword) + r'\b'
            for match in re.finditer(pattern, text.lower()):
                # Get context around the match
                start = max(0, match.start() - 200)
                end = min(len(text), match.end() + 200)
                matches.append((match.start(), text[start:end]))

        # If matches found, add to ethical passages
        if matches:
            # Sort by position in text
            matches.sort(key=lambda x: x[0])

            # Take up to 3 matches from this document
            for _, passage in matches[:3]:
                if len(passage) > max_length:
                    passage = passage[:max_length] + "..."
                ethical_passages.append({
                    'passage': passage,
                    'url': doc.get('url', ''),
                    'domain': doc.get('domain', '')
                })

    print(f"Found {len(ethical_passages)} passages with ethical content")
    return ethical_passages
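# Passages whose keyword counts are zero for every category are dropped during
# categorization rather than being given a default label.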
def categorize_ethical_content(passages):
    """Determine primary ethical category for each passage"""
    categorized = []

    for passage in tqdm(passages, desc="Categorizing content"):
        text = passage['passage'].lower()

        # Count matches for each category
        category_counts = {}
        for category, keywords in ETHICAL_CATEGORIES.items():
            count = 0
            for keyword in keywords:
                pattern = r'\b' + re.escape(keyword) + r'\b'
                count += len(re.findall(pattern, text))
            category_counts[category] = count

        # Choose primary category (highest count)
        if sum(category_counts.values()) > 0:
            primary_category = max(category_counts.items(), key=lambda x: x[1])[0]
            passage['category'] = primary_category
            categorized.append(passage)

    return categorized
def generate_ethical_reasoning(passage, category, model=DEFAULT_MODEL, timeout=60):
    """Generate ethical reasoning for a passage using the Ollama API"""
    # Truncate passage if it's too long
    if len(passage) > 1500:
        passage = passage[:1500] + "..."

    system_prompt = f"""You are an ethical reasoning assistant trained to analyze text passages and provide detailed ethical reasoning.

For the given text passage, identify ethical considerations related to {category} and develop a step-by-step ethical reasoning process.
Analyze the implications thoroughly and consider multiple perspectives.

Structure your reasoning into two main sections with the following format:

<|begin_of_thought|>
(Your step-by-step ethical analysis here, analyzing the ethical implications in detail)
<|end_of_thought|>

<|begin_of_solution|>
(Your final ethical assessment and recommendation, summarizing the key points from your analysis)
<|end_of_solution|>

Keep your response concise but insightful, focusing on the most important ethical considerations."""
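    # The <|begin_of_thought|> / <|begin_of_solution|> markers separate the model's
    # step-by-step analysis from its final assessment in the generated reasoning text.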
    user_prompt = f"""Analyze the following text passage from an ethical perspective, focusing especially on considerations related to {category}:

{passage}

Provide ethical reasoning following the format I specified."""

    try:
        response = requests.post(
            OLLAMA_API,
            json={
                "model": model,
                "system": system_prompt,
                "prompt": user_prompt,
                "stream": False
            },
            timeout=timeout
        )

        if response.status_code == 200:
            result = response.json()
            return result.get("response", "")
        else:
            return f"Error generating ethical reasoning: {response.status_code}"
    except requests.exceptions.Timeout:
        return "Request timed out while generating ethical reasoning"
    except Exception as e:
        return f"Exception during generation: {str(e)}"
def generate_training_data(passages, count, model=DEFAULT_MODEL):
    """Generate training examples with ethical reasoning"""
    # Ensure we have enough passages
    if len(passages) < count:
        print(f"Warning: Only {len(passages)} passages available, fewer than the requested {count}")
        count = min(len(passages), count)

    # Select a diverse set of passages
    selected_passages = []
    categories = list(ETHICAL_CATEGORIES.keys())

    # Try to get an even distribution across categories
    for category in categories:
        category_passages = [p for p in passages if p.get('category') == category]
        per_category = max(1, count // len(categories))
        if len(category_passages) > 0:
            selected_passages.extend(random.sample(
                category_passages,
                min(per_category, len(category_passages))
            ))

    # If we still need more passages, add randomly
    if len(selected_passages) < count:
        remaining = [p for p in passages if p not in selected_passages]
        if remaining:
            selected_passages.extend(random.sample(
                remaining,
                min(count - len(selected_passages), len(remaining))
            ))

    # Limit to requested count
    selected_passages = selected_passages[:count]

    # Generate ethical reasoning for each passage
    training_data = []
    print(f"Generating ethical reasoning for {len(selected_passages)} passages")

    for i, passage in enumerate(selected_passages):
        category = passage.get('category', 'general ethics')

        # Progress display
        print(f"Processing example {i+1}/{len(selected_passages)}: {category}")

        # Generate reasoning with timeout protection
        try:
            reasoning = generate_ethical_reasoning(passage['passage'], category, model, timeout=90)

            # Create training example
            example = {
                "passage": passage['passage'],
                "category": category,
                "url": passage.get('url', ''),
                "domain": passage.get('domain', ''),
                "reasoning": reasoning
            }
            training_data.append(example)

            # Save example immediately to avoid losing work if the script crashes
            example_path = os.path.join("data", f"ethical_examples_{i+1}.json")
            ensure_dir(example_path)
            with open(example_path, 'w', encoding='utf-8') as f:
                json.dump(example, f, indent=2)
        except Exception as e:
            print(f"Error processing example {i+1}: {str(e)}")
            continue

    return training_data
def save_training_data(training_data, output_file):
    """Save training data to a JSONL file"""
    ensure_dir(output_file)

    with open(output_file, 'w', encoding='utf-8') as f:
        for example in training_data:
            f.write(json.dumps(example) + '\n')

    print(f"Saved {len(training_data)} training examples to {output_file}")
def main():
    args = parse_args()

    # Check if the Ollama API is available
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            print("Warning: Ollama API doesn't seem to be available. Generation may fail.")
    except Exception:
        print("Warning: Ollama API doesn't seem to be available. Generation may fail.")

    # Load data
    documents = load_cc_data(args.input)
    if not documents:
        print("No documents found. Please run the download script first.")
        return

    # Process data
    print(f"Extracting ethical content from {len(documents)} documents...")
    ethical_passages = extract_ethical_content(documents)

    print(f"Categorizing {len(ethical_passages)} ethical passages...")
    categorized_passages = categorize_ethical_content(ethical_passages)

    print(f"Generating {args.count} training examples...")
    training_data = generate_training_data(categorized_passages, args.count, args.model)

    # Save training data
    save_training_data(training_data, args.output)

    print("Done!")
if __name__ == "__main__":
    main()