本次是機器學習第二次作業,目標是訓練一個影像辨識模型,辨識 The Simpsons 動畫影集中的 50 位角色。作業以卷積神經網路 (Convolutional Neural Network, CNN) 作為模型基礎進行角色的影像辨識;下方程式碼實際載入的是 torchvision 的預訓練 ViT-B/16 (`vit_b_16`),並替換其分類頭。
Kaggle Competition:點擊這裡 競賽連結
準確率在 0.86073 左右
- abraham_grampa_simpson (阿伯拉罕·辛普森爺爺)
- agnes_skinner (艾格尼斯·斯金納)
- apu_nahasapeemapetilon (阿普)
- barney_gumble (巴尼·岡博)
- bart_simpson (巴特·辛普森)
- brandine_spuckler (布蘭丁·斯帕克勒)
- carl_carlson (卡爾·卡爾森)
- charles_montgomery_burns (蒙哥馬利·伯恩斯先生)
- chief_wiggum (維格姆警長)
- cletus_spuckler (克萊圖斯·斯帕克勒)
- comic_book_guy (漫畫書店老闆)
- disco_stu (迪斯可·史都)
- dolph_starbeam (道夫·斯塔比姆)
- duff_man (達夫啤酒人)
- edna_krabappel (艾德娜·克拉巴佩爾老師)
- fat_tony (胖東尼)
- gary_chalmers (加里·查莫斯)
- gil (吉爾)
- groundskeeper_willie (威利園丁)
- homer_simpson (霍默·辛普森)
- About 2000 images per character
- Pictures come in various sizes, scenes, and distortions
- Characters are not necessarily centered in each image and may appear alongside, or be cropped from, other characters
!pip install torch
!pip install torchvision
!pip install timm
!pip install pandas
!pip install matplotlib
import os

import pandas as pd
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.transforms.v2 as T
from PIL import Image
from torch.utils.data import random_split, DataLoader
from torchvision import datasets, transforms
# Custom transform to add Gaussian noise
class AddGaussianNoise(object):
    """Transform that adds Gaussian noise to a tensor.

    The result is ``tensor + noise * std + mean`` where ``noise`` is drawn
    from a standard normal distribution. The output is not clamped.

    :param mean: Constant offset added to every element.
    :param std: Scale applied to the standard-normal noise.
    """

    def __init__(self, mean=0., std=1.):
        self.std = std
        self.mean = mean

    def __call__(self, tensor):
        """Return a noisy copy of ``tensor``; the input is left untouched.

        :param tensor: PyTorch tensor to which noise is added
        :return: PyTorch tensor with the same shape as ``tensor``
        """
        # Sample on the input's device; a CPU-only sample would raise a
        # device-mismatch error as soon as the input lives on an accelerator.
        noise = torch.randn(tensor.shape, device=tensor.device)
        return tensor + noise * self.std + self.mean
# Custom transform to add Speckle noise
class AddSpeckleNoise(object):
    """Add speckle (multiplicative) noise to the image."""

    def __init__(self, noise_level=0.1):
        """
        :param noise_level: Standard deviation of the noise distribution
        """
        self.noise_level = noise_level

    def __call__(self, tensor):
        """
        :param tensor: PyTorch tensor, the image on which noise is added
        :return: PyTorch tensor, image with speckle noise
        """
        # Zero-mean Gaussian noise scaled by noise_level, matching the
        # input's shape, dtype and device.
        noise = torch.randn_like(tensor) * self.noise_level
        # Speckle noise is multiplicative: each pixel is scaled by (1 + noise).
        noisy_tensor = tensor * (1 + noise)
        # Clip the values to be between 0 and 1.
        return torch.clamp(noisy_tensor, 0, 1)
class AddPoissonNoise(object):
    """Add Poisson noise to the image."""

    def __init__(self, lam=1.0):
        """
        :param lam: Lambda parameter for Poisson distribution
        """
        self.lam = lam

    def __call__(self, tensor):
        """
        :param tensor: PyTorch tensor, the image to which noise is added
        :return: PyTorch tensor, image with Poisson noise
        """
        # Build the rate tensor on the input's device so that the addition
        # below does not mix devices. float() keeps the rates floating point
        # even when lam is given as an int, which torch.poisson requires.
        rates = torch.full(tensor.shape, float(self.lam), device=tensor.device)
        noise = torch.poisson(rates)
        # Counts are rescaled by 1/255, assuming the image is scaled
        # between 0 and 1.
        noisy_tensor = tensor + noise / 255.0
        # Clip the values to be between 0 and 1.
        return torch.clamp(noisy_tensor, 0, 1)
# Custom transform to add Salt and Pepper noise
class AddSaltPepperNoise(object):
    """Transform that adds salt-and-pepper noise to a tensor.

    Each element is independently set to 1 ("salt") with probability
    ``salt_prob`` or to 0 ("pepper") with probability ``pepper_prob``.

    :param salt_prob: Probability of forcing an element to 1.
    :param pepper_prob: Probability of forcing an element to 0.
    """

    def __init__(self, salt_prob=0.05, pepper_prob=0.05):
        self.salt_prob = salt_prob
        self.pepper_prob = pepper_prob

    def __call__(self, tensor):
        """Return a noisy copy of ``tensor``; the input is not modified.

        :param tensor: PyTorch tensor to corrupt
        :return: PyTorch tensor of the same shape with salt/pepper elements
        """
        # One uniform draw per element, on the input's device, so the
        # boolean masks below can index tensors that live on an accelerator.
        noise = torch.rand(tensor.shape, device=tensor.device)
        # Clone the tensor to avoid modifying the original.
        noisy_tensor = tensor.clone()
        # Salt: the lowest salt_prob fraction of draws becomes 1.
        noisy_tensor[noise < self.salt_prob] = 1
        # Pepper: the highest pepper_prob fraction of draws becomes 0.
        noisy_tensor[noise > 1 - self.pepper_prob] = 0
        return noisy_tensor
# Define the image augmentation transformations
# Training-time preprocessing and augmentation pipeline.
transform = T.Compose([
    T.Resize(256),                      # Resize the image
    T.CenterCrop(224),                  # Center-crop to 224x224
    T.ToTensor(),                       # Convert to a tensor
    T.RandomHorizontalFlip(p=0.1),      # Horizontal flip
    T.RandomVerticalFlip(p=0.1),        # Vertical flip
    T.RandomRotation(10),               # Random rotation
    T.ColorJitter(0.4, 0.4, 0.4, 0.1),  # Color jitter
    T.RandomGrayscale(p=0.1),           # Random grayscale
    # Normalize with the ImageNet mean/std used by the pretrained ViT.
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])  # closing bracket was missing, leaving the Compose call unterminated
from torch.utils.data import Subset

batch_size = 32

# Deterministic preprocessing for validation. Scoring validation images
# through the random training augmentations would make the reported accuracy
# noisy and not comparable between epochs.
val_transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load the complete training set twice: same files, different transforms.
full_dataset = datasets.ImageFolder(root='./train/train', transform=transform)
val_view = datasets.ImageFolder(root='./train/train', transform=val_transform)

# Split ratio: 80% for training, 20% for validation.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Seeded permutation so the split is reproducible across runs. Both
# ImageFolder views enumerate the same files, so an index refers to the same
# image in either view.
split_generator = torch.Generator().manual_seed(42)
shuffled_indices = torch.randperm(len(full_dataset), generator=split_generator).tolist()
train_dataset = Subset(full_dataset, shuffled_indices[:train_size])
val_dataset = Subset(val_view, shuffled_indices[train_size:])

# Build the DataLoaders.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Class names as discovered by ImageFolder.
class_names = full_dataset.classes
from torchvision.models import ViT_B_16_Weights, vit_b_16

# Use the ViT-B/16 model provided by torchvision with ImageNet weights.
# `weights=` replaces the deprecated `pretrained=True` flag.
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)

# Replace the classification head.
in_features = model.heads.head.in_features  # feature size feeding the original head
num_classes = len(class_names)              # number of target classes
# NOTE(review): there is no activation between the two Linear layers, so the
# head is still a single affine map; kept as-is so the architecture (and any
# saved state_dict keys) do not change.
model.heads.head = nn.Sequential(
    nn.Linear(in_features, 256),   # intermediate layer
    nn.Linear(256, num_classes),   # output layer
)

# Move to the target device only after the new head is attached; moving first
# would leave the freshly created head on the CPU.
model = model.to(device)
import torch.optim as optim

# Training hyper-parameters, loss function and optimizer.
# The original defined criterion/optimizer twice; only the second pair
# (Adam, lr=1e-4, no weight decay) ever took effect, so the shadowed first
# definitions are removed.
num_epochs = 30                                       # number of training epochs
criterion = torch.nn.CrossEntropyLoss()               # standard loss for classification
optimizer = optim.Adam(model.parameters(), lr=1e-4)   # Adam optimizer
# Train for num_epochs epochs, reporting the mean training loss and the
# validation accuracy after each epoch.
for epoch in range(num_epochs):
    model.train()  # training mode
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Move images and labels to the device the model lives on.
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass and gradient update. The backward pass and optimizer
        # step were missing, so the weights were never updated.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean training loss over the epoch's batches.
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader):.4f}")

    model.eval()  # evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # no gradients needed for evaluation; saves memory
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty validation set instead of dividing by zero.
    accuracy = 100 * correct / total if total else 0.0
    print(f'Validation Accuracy: {accuracy:.2f}%')
# 定義圖像的轉換
# Deterministic preprocessing for test images (no augmentation).
# Note: this rebinds the module-level name `transform`.
transform = T.Compose([
    T.Resize(256),      # Resize the image
    T.CenterCrop(224),  # Center crop
    T.ToTensor(),       # Convert to a tensor
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize
])  # closing bracket was missing, leaving the Compose call unterminated
# 自定義排序函數
def numeric_sort(img_list):
    """Return the samples ordered by the integer value of their file name.

    Each item is indexed with ``[0]`` to obtain a path; the path's base name
    without its extension is parsed with ``int`` and used as the sort key.

    :param img_list: iterable of items whose first element is a file path
    :return: new list of the same items in ascending numeric order
    :raises ValueError: if a file name (minus extension) is not an integer
    """
    def _file_number(sample):
        filename = os.path.basename(sample[0])
        stem, _extension = os.path.splitext(filename)
        return int(stem)

    ordered = list(img_list)
    ordered.sort(key=_file_number)
    return ordered
# Load the test set and put its samples into numeric file-name order.
test_dataset = datasets.ImageFolder(transform=transform, root='./test-final')
ordered_samples = numeric_sort(test_dataset.samples)
test_dataset.samples = ordered_samples  # sorted by numeric file name
test_dataset.imgs = ordered_samples     # keep the imgs alias pointing at the sorted list

# Build the test DataLoader; shuffle stays off so the sorted order is kept.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)
# Switch the model to evaluation mode. The original comment announced this
# step but the call itself was missing.
model.eval()

predictions = []  # one {'id', 'character'} row per test image

# Iterate over the test set and collect predictions.
with torch.no_grad():
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)  # forward pass
        predicted = outputs.argmax(dim=1)  # predicted class index for each sample

        for class_index in predicted.tolist():
            predictions.append({
                # IDs are 1-based positions in the loader's order.
                'id': len(predictions) + 1,
                'character': class_names[class_index],  # index -> class name
            })

# Save the submission file.
submission_df = pd.DataFrame(predictions)
submission_df.to_csv('submission-new.csv', index=False)
print("Submission file 'submission-new.csv' generated successfully!")