initial code commit
dr-gareth-roberts committed Nov 27, 2024
1 parent c47f3f4 commit 2111b87
Showing 43 changed files with 12,360 additions and 0 deletions.
270 changes: 270 additions & 0 deletions AdvancedAdversarialDatasetExplorer.py
@@ -0,0 +1,270 @@
import concurrent.futures
import itertools
import json
import logging
import re
from typing import Any, Dict, List, Optional

import numpy as np
import torch

import datasets
from transformers import AutoTokenizer, AutoModel

class AdvancedAdversarialDatasetExplorer:
    def __init__(
        self,
        embedding_model: str = 'sentence-transformers/all-MiniLM-L6-v2'
    ):
        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Load embedding model
        self.tokenizer = AutoTokenizer.from_pretrained(embedding_model)
        self.embedding_model = AutoModel.from_pretrained(embedding_model)

        # Initialize dataset collections
        self.toxic_prompt_datasets = [
            'offensive_language/civil_comments',
            'hate_speech/hatebase_twitter',
            'toxicity/wikipedia_toxicity'
        ]

        self.reasoning_attack_datasets = [
            'reasoning/logiqa',
            'reasoning/commonsense_qa',
            'reasoning/strategy_qa'
        ]

        self.adversarial_datasets = [
            'adversarial/universal_adversarial_triggers',
            'adversarial/jailbreak_prompts',
            'adversarial/red_team_attempts'
        ]

    def _load_dataset(
        self,
        dataset_name: str,
        split: str = 'train'
    ) -> Optional[datasets.Dataset]:
        """
        Load a dataset from the Hugging Face Hub.
        Returns None if the dataset cannot be loaded.
        """
        try:
            return datasets.load_dataset(dataset_name, split=split)
        except Exception as e:
            self.logger.error(f"Error loading dataset {dataset_name}: {e}")
            return None

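    def _load_datasets_parallel(
        self,
        dataset_names: List[str],
        split: str = 'train'
    ) -> Dict[str, Optional[datasets.Dataset]]:
        """
        Sketch, not in the original commit: the module imports
        concurrent.futures, so a collection of datasets could be loaded
        concurrently like this. The method name and signature are
        assumptions added for illustration.
        """
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(self._load_dataset, name, split): name
                for name in dataset_names
            }
            return {
                futures[future]: future.result()
                for future in concurrent.futures.as_completed(futures)
            }
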
    def analyze_toxic_prompt_patterns(
        self,
        toxicity_threshold: float = 0.7
    ) -> Dict[str, Any]:
        """
        Comprehensive toxic prompt pattern analysis.
        """
        toxic_pattern_analysis = {}

        for dataset_name in self.toxic_prompt_datasets:
            dataset = self._load_dataset(dataset_name)

            if dataset is None:
                continue

            # Extract prompts whose toxicity exceeds the threshold
            toxic_prompts = [
                item['text']
                for item in dataset
                if item.get('toxicity', 0) > toxicity_threshold
            ]

            # Skip datasets with no qualifying prompts (np.vstack on an
            # empty list would raise)
            if not toxic_prompts:
                continue

            # Compute embedding clustering
            toxic_embeddings = self._compute_embeddings(toxic_prompts)
            clustering_analysis = self._cluster_embeddings(toxic_embeddings)

            toxic_pattern_analysis[dataset_name] = {
                'total_toxic_prompts': len(toxic_prompts),
                'clustering_analysis': clustering_analysis,
                'toxicity_distribution': self._compute_toxicity_distribution(dataset)
            }

        return toxic_pattern_analysis

    def explore_reasoning_attack_vectors(
        self,
        attack_type: str = 'logical_contradiction'
    ) -> Dict[str, Any]:
        """
        Explore reasoning-attack vectors across the reasoning datasets.
        """
        reasoning_attack_analysis = {}

        for dataset_name in self.reasoning_attack_datasets:
            dataset = self._load_dataset(dataset_name)

            if dataset is None:
                continue

            # Extract reasoning samples
            reasoning_samples = [
                item['question']
                for item in dataset
                if 'question' in item
            ]

            # Compute attack potential
            attack_potential_scores = self._compute_attack_potential(
                reasoning_samples,
                attack_type
            )

            reasoning_attack_analysis[dataset_name] = {
                'total_samples': len(reasoning_samples),
                'attack_potential_distribution': attack_potential_scores
            }

        return reasoning_attack_analysis

    def generate_adversarial_prompt_variants(
        self,
        base_prompt: str,
        num_variants: int = 10
    ) -> List[str]:
        """
        Generate adversarial prompt variants.
        """
        # Load adversarial datasets
        adversarial_triggers = []
        for dataset_name in self.adversarial_datasets:
            dataset = self._load_dataset(dataset_name)

            if dataset is not None:
                adversarial_triggers.extend([
                    item.get('trigger', '')
                    for item in dataset
                ])

        # Generate prompt variants
        prompt_variants = []
        for trigger in itertools.islice(adversarial_triggers, num_variants):
            variant = f"{base_prompt} {trigger}"
            prompt_variants.append(variant)

        return prompt_variants

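    # Design note (added; the alternative is a suggestion, not part of the
    # original commit): itertools.islice in the method above always takes the
    # first num_variants triggers in dataset order. If that order is not
    # meaningful, random.sample(adversarial_triggers, num_variants) would
    # draw a more varied subset (requires `import random`).
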
    def _compute_embeddings(
        self,
        texts: List[str]
    ) -> np.ndarray:
        """
        Compute mean-pooled text embeddings.
        """
        embeddings = []

        for text in texts:
            inputs = self.tokenizer(
                text,
                return_tensors='pt',
                padding=True,
                truncation=True
            )

            with torch.no_grad():
                outputs = self.embedding_model(**inputs)
                # Mean-pool token embeddings into one vector per text
                embedding = outputs.last_hidden_state.mean(dim=1)
                embeddings.append(embedding.numpy())

        return np.vstack(embeddings)

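    def _compute_embeddings_batched(
        self,
        texts: List[str],
        batch_size: int = 32
    ) -> np.ndarray:
        """
        Sketch, not in the original commit: tokenize texts in batches so
        padding=True actually pads across a batch, and mask out padding
        tokens when mean-pooling. The method name and batch_size default
        are assumptions added for illustration.
        """
        pooled_batches = []
        for start in range(0, len(texts), batch_size):
            inputs = self.tokenizer(
                texts[start:start + batch_size],
                return_tensors='pt',
                padding=True,
                truncation=True
            )
            with torch.no_grad():
                outputs = self.embedding_model(**inputs)
            # Masked mean pooling: exclude padding positions from the average
            mask = inputs['attention_mask'].unsqueeze(-1)
            summed = (outputs.last_hidden_state * mask).sum(dim=1)
            pooled = summed / mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(pooled.numpy())
        return np.vstack(pooled_batches)
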
    def _cluster_embeddings(
        self,
        embeddings: np.ndarray
    ) -> Dict[str, Any]:
        """
        Cluster embeddings with density-based clustering (DBSCAN).
        """
        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler

        # Standardize embeddings
        scaler = StandardScaler()
        scaled_embeddings = scaler.fit_transform(embeddings)

        # Apply DBSCAN clustering (label -1 marks noise points)
        clusterer = DBSCAN(eps=0.5, min_samples=3)
        cluster_labels = clusterer.fit_predict(scaled_embeddings)

        # Shift labels by one so noise (-1) bins at index 0; convert to a
        # plain list so the result stays JSON-serializable
        return {
            'num_clusters': len(np.unique(cluster_labels)),
            'cluster_distribution': np.bincount(cluster_labels + 1).tolist()
        }

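    def _cluster_embeddings_kmeans(
        self,
        embeddings: np.ndarray,
        num_clusters: int = 5
    ) -> Dict[str, Any]:
        """
        Alternative sketch, not in the original commit: unlike DBSCAN,
        KMeans honours an explicit cluster count. The method name is an
        assumption added for illustration.
        """
        from sklearn.cluster import KMeans
        from sklearn.preprocessing import StandardScaler

        scaled_embeddings = StandardScaler().fit_transform(embeddings)
        cluster_labels = KMeans(
            n_clusters=num_clusters,
            n_init=10
        ).fit_predict(scaled_embeddings)

        return {
            'num_clusters': num_clusters,
            'cluster_distribution': np.bincount(cluster_labels).tolist()
        }
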
    def _compute_attack_potential(
        self,
        samples: List[str],
        attack_type: str
    ) -> Dict[str, float]:
        """
        Compute attack-potential scores for reasoning samples.
        """
        # Simplified attack potential computation
        attack_potential_scores = {}

        # Different scoring strategies based on attack type; cast means to
        # float so the result stays JSON-serializable
        if attack_type == 'logical_contradiction':
            attack_potential_scores = {
                'ambiguity_score': float(np.mean([
                    len(re.findall(r'\b(maybe|possibly|perhaps)\b', sample.lower()))
                    for sample in samples
                ])),
                'contradiction_potential': float(np.mean([
                    len(re.findall(r'\b(but|however|although)\b', sample.lower()))
                    for sample in samples
                ]))
            }

        return attack_potential_scores

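    # Illustrative example for the method above (inputs assumed, not real
    # dataset content):
    #   _compute_attack_potential(["Maybe it rains, but perhaps not."],
    #                             'logical_contradiction')
    #   -> {'ambiguity_score': 2.0, 'contradiction_potential': 1.0}
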
    def _compute_toxicity_distribution(
        self,
        dataset: datasets.Dataset
    ) -> Dict[str, float]:
        """
        Compute summary statistics of the toxicity distribution.
        """
        toxicity_scores = [
            item.get('toxicity', 0)
            for item in dataset
        ]

        if not toxicity_scores:
            return {}

        # Cast to float so the result stays JSON-serializable
        return {
            'mean_toxicity': float(np.mean(toxicity_scores)),
            'median_toxicity': float(np.median(toxicity_scores)),
            'max_toxicity': float(np.max(toxicity_scores))
        }

def main():
    # Initialize dataset explorer
    dataset_explorer = AdvancedAdversarialDatasetExplorer()

    # Analyze toxic prompt patterns
    toxic_analysis = dataset_explorer.analyze_toxic_prompt_patterns()
    print("Toxic Prompt Pattern Analysis:")
    print(json.dumps(toxic_analysis, indent=2))

    # Explore reasoning attack vectors
    reasoning_attacks = dataset_explorer.explore_reasoning_attack_vectors()
    print("\nReasoning Attack Vector Analysis:")
    print(json.dumps(reasoning_attacks, indent=2))

    # Generate adversarial prompt variants
    base_prompt = "Explain the principles of ethical AI development"
    adversarial_variants = dataset_explorer.generate_adversarial_prompt_variants(
        base_prompt,
        num_variants=5
    )
    print("\nAdversarial Prompt Variants:")
    for variant in adversarial_variants:
        print(variant)


if __name__ == "__main__":
    main()