
Commit

update
aimspot committed Jun 5, 2024
1 parent 0d74bd8 commit 1f58486
Showing 19 changed files with 141 additions and 389 deletions.
115 changes: 115 additions & 0 deletions src/data_processing/ml_processing/annotation_analysis.py
@@ -1,7 +1,57 @@
import csv
import os
from collections import Counter, defaultdict
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

from src.data_processing.data_utils.utils import load_class_names
from src.data_processing.ml_processing.plots import plot_class_balance

def dumpCSV(class_names, class_labels, dict_class_labels, run_path):
    """Append per-folder and overall class counts to class_counts.csv."""
    # Turn each folder's label list into a Counter, plus an 'all' counter over every label.
    for key, value in dict_class_labels.items():
        dict_class_labels[key] = Counter(value)
    dict_class_labels['all'] = Counter(class_labels)

    # Ensure every class appears in every counter, even with a zero count.
    for counts in dict_class_labels.values():
        for class_name in class_names:
            counts.setdefault(class_name, 0)

    csv_file_path = run_path / 'class_counts.csv'
    file_exists = csv_file_path.is_file()

    with open(csv_file_path, 'a', newline='') as csvfile:
        field_names = ['class-name'] + [f'{key}-count' for key in dict_class_labels]
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        if not file_exists:
            writer.writeheader()

        # One row per class, ordered by overall frequency (most frequent first).
        for class_name, _ in sorted(dict_class_labels['all'].items(), key=lambda x: x[1], reverse=True):
            row = {'class-name': class_name}
            for key, counts in dict_class_labels.items():
                row[f'{key}-count'] = counts[class_name]
            writer.writerow(row)


def calculate_iou(bbox1, bbox2):
"""
@@ -74,4 +124,69 @@ def analysis_yolo_annotations(annotation_paths):
'Average Objects Per Image': avg_objects_per_image,
'Average Overlap': sum(overlaps) / len(overlaps) if overlaps else 0,
}
return analysis_results


def load_yolo_labels(annotations_path, class_names):
    """Load class labels from YOLO annotation files, grouped by dataset folder."""
    dict_labels = defaultdict(list)
    labels = []
    for filename in annotations_path:
        folder_name = Path(filename).parts[-3]
        if filename.endswith('.txt'):
            with open(filename, 'r') as file:
                for line in file:
                    parts = line.split()
                    # A valid YOLO annotation line is: class_id x_center y_center width height
                    if len(parts) >= 5:
                        class_id = int(parts[0])
                        labels.append(class_names[class_id])
                        dict_labels[folder_name].append(class_names[class_id])
    return dict_labels, labels


def gini_coefficient(labels):
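    """Gini impurity of the label distribution: sum over classes of p * (1 - p)."""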
unique, counts = np.unique(labels, return_counts=True)
class_counts = dict(zip(unique, counts))
total_examples = len(labels)
gini = 0
for label in class_counts:
label_prob = class_counts[label] / total_examples
gini += label_prob * (1 - label_prob)
return gini


def calculate_class_imbalance(labels):
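    """Imbalance ratio: count of the most frequent class divided by the mean class count (1.0 means perfectly balanced)."""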
class_counts = Counter(labels)
max_count = max(class_counts.values())
average_count = sum(class_counts.values()) / len(class_counts)
overall_imbalance = max_count / average_count
return overall_imbalance


def get_image_size(image_path):
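    """Return (width, height) of the image at image_path, or None if it cannot be read."""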
image = cv2.imread(image_path)
if image is not None:
height, width, _ = image.shape
return width, height
return None


def analysis_stats(images_path, annotations_path, classes_path, run_path):
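    """Compute dataset-level statistics (image size, class balance, counts) and write the class-balance plot and CSV."""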
class_names = load_class_names(classes_path)
dict_labels, class_labels = load_yolo_labels(annotations_path, class_names)
gini = "{:.2f}".format(gini_coefficient(class_labels))
plot_class_balance(class_labels, run_path)
dumpCSV(class_names, class_labels, dict_labels, run_path)
imbalance_ratio = calculate_class_imbalance(class_labels)
image_count = len(images_path)
number_of_classes = len(set(class_labels))
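    # Image size is read from the first image only; the dataset is assumed to be uniform.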
img_w, img_h = get_image_size(images_path[0])
analysis_results = {
'W': img_w,
'H': img_h,
'Class Imbalance Gini': gini,
'Class Imbalance Ratio': imbalance_ratio,
'Number of images': image_count,
'Number of classes': number_of_classes,
}
return analysis_results
@@ -12,8 +12,8 @@
file = Path(__file__).resolve()

from src.data_processing.ml_processing.plots import plot_class_balance
-from src.data_processing.ml_processing.annotation_analysis import analysis_yolo_annotations
-from src.data_processing.ml_processing.image_analysis import analysis_stats, analysis_image_dataset
+from src.data_processing.ml_processing.annotation_analysis import analysis_yolo_annotations, analysis_stats
+from src.data_processing.ml_processing.image_analysis import analysis_image_dataset


def find_paths(data_path, image_mode=True):  # formerly find_image in info_processor
@@ -45,7 +45,6 @@ def feature_extraction(dataset_path, classes_path, run_path):
df_dataset_features = pd.concat([df_analyze_color_stats, df_color_stats, df_analyze_annotations, df_analyze_stats], axis=1)
df_dataset_features.to_csv(run_path / 'dataset_features.csv', index=False)


return df_dataset_features


62 changes: 0 additions & 62 deletions src/data_processing/ml_processing/image_analysis.py
@@ -1,5 +1,3 @@
from src.data_processing.data_utils.utils import load_class_names
from src.data_processing.ml_processing.plots import plot_class_balance
from collections import Counter
from tqdm import tqdm
import cv2
@@ -138,66 +136,6 @@ def compute_hist_and_stats(channel, bins=256, range=(0, 256), is_normalized=True
return stats_dict


def gini_coefficient(labels):
unique, counts = np.unique(labels, return_counts=True)
class_counts = dict(zip(unique, counts))
total_examples = len(labels)
gini = 0
for label in class_counts:
label_prob = class_counts[label] / total_examples
gini += label_prob * (1 - label_prob)
return gini

def calculate_class_imbalance(labels):
class_counts = Counter(labels)
max_count = max(class_counts.values())
average_count = sum(class_counts.values()) / len(class_counts)
overall_imbalance = max_count / average_count
return overall_imbalance


def get_image_size(image_path):
image = cv2.imread(image_path)
if image is not None:
height, width, _ = image.shape
return width, height
return None


def load_yolo_labels(annotations_path, class_names):
""" Загрузка меток классов из YOLO аннотаций. """
dict_labels = dict()
labels = list()
for filename in annotations_path:
if filename.endswith('.txt'):
with open(filename, 'r') as file:
for line in file:
parts = line.split()
if len(parts) >= 5:
class_id = int(parts[0])
labels.append(class_names[class_id])
return labels

def analysis_stats(images_path, annotations_path, classes_path, run_path):
class_names = load_class_names(classes_path)
class_labels = load_yolo_labels(annotations_path, class_names)
gini = "{:.2f}".format(gini_coefficient(class_labels))
plot_class_balance(class_labels, run_path)
imbalance_ratio = calculate_class_imbalance(class_labels)
image_count = len(images_path)
number_of_classes = len(set(class_labels))
img_w, img_h = get_image_size(images_path[0])
analysis_results = {
'W': img_w,
'H': img_h,
'Class Imbalance Gini': gini,
'Class Imbalance Ratio': imbalance_ratio,
'Number of images': image_count,
'Number of classes': number_of_classes,
}
return analysis_results


def analysis_image_dataset(images_path):
analyze_color_stats = []
diversity_list = []
16 changes: 7 additions & 9 deletions src/data_processing/ml_processing/plots.py
@@ -21,7 +21,7 @@ def plot_class_balance(labels, output_path):



-def plot_with_lines_and_predictions(train_pca, test_pca, train_labels, names_test, predicted_labels, names_train, ax, title, encoder):
+def plot_with_lines_and_predictions(train_umap, test_umap, train_labels, names_test, predicted_labels, names_train, ax, title, encoder):
unique_labels = np.unique(train_labels)
colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))
legend_elements = []
@@ -30,15 +30,15 @@ def plot_with_lines_and_predictions(train_pca, test_pca, train_labels, names_tes

for k, col in zip(unique_labels, colors):
class_member_mask = (train_labels == k)
-        xy = train_pca[class_member_mask]
+        xy = train_umap[class_member_mask]
ax.scatter(xy[:, 0], xy[:, 1], s=50, c=[col], edgecolor='black', alpha=0.75)
if len(xy) > 1:
center = np.mean(xy, axis=0)
radius = np.max(np.linalg.norm(xy - center, axis=1))
circle = plt.Circle(center, radius, color=col, fill=False, lw=2, linestyle='--')
ax.add_patch(circle)

-    for i, point in enumerate(test_pca):
+    for i, point in enumerate(test_umap):
pred_label = predicted_labels[i]
color = colors[unique_labels.tolist().index(pred_label)] if pred_label in unique_labels else 'gray'

@@ -48,20 +48,18 @@ def plot_with_lines_and_predictions(train_pca, test_pca, train_labels, names_tes
bbox_edgecolor = 'black'
ax.text(point[0], point[1], names_test, fontsize=9, color='green', ha='center', va='bottom', bbox=dict(boxstyle="round,pad=0.3", facecolor='white', edgecolor=bbox_edgecolor, lw=1))

-        train_idx = names_train[names_train == encoder.inverse_transform([pred_label.ravel()])[0]].index[0]
+        train_idx = names_train[names_train == encoder.inverse_transform(pred_label)[0]].index[0]


-        train_point = train_pca[train_idx]
+        train_point = train_umap[train_idx]
ax.plot([point[0], train_point[0]], [point[1], train_point[1]], 'k--', linewidth=1)
ax.text(train_point[0], train_point[1], names_train.iloc[train_idx], fontsize=9, color='black', ha='right', va='top')

true_patch = mpatches.Patch(edgecolor='black', facecolor='white', label='True', lw=1)
false_patch = mpatches.Patch(edgecolor='red', facecolor='white', label='False', lw=1)
legend_elements.append(true_patch)
legend_elements.append(false_patch)
ax.set_title(title)
-    ax.set_xlabel('PCA 1')
-    ax.set_ylabel('PCA 2')
+    ax.set_xlabel('UMAP 1')
+    ax.set_ylabel('UMAP 2')
ax.legend(handles=legend_elements, loc='upper right', fontsize='small')


11 changes: 5 additions & 6 deletions src/data_processing/ml_processing/recommendation_module.py
@@ -9,10 +9,9 @@


def ml_predict(df_rules, df_dataset_features, run_path):

scaler = StandardScaler()
encoder = LabelEncoder()
-    mds = umap.UMAP(random_state=42)
+    mds = umap.UMAP()

cols_to_drop = [col for col in df_rules.columns if col.startswith(('Min', 'Max'))]
df_rules = df_rules.drop(columns=cols_to_drop)
@@ -28,10 +28,10 @@ def ml_predict(df_rules, df_dataset_features, run_path):
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

-    y_train = encoder.fit_transform(y_train)
+    y_train = encoder.fit_transform(y_train.values.ravel())

-    X_pca = mds.fit_transform(X_train) # for plot
-    X_pca_test = mds.transform(X_test) # for plot
+    X_umap = mds.fit_transform(X_train) # for plot
+    X_umap_test = mds.transform(X_test) # for plot
train_dataset_names = df_rules['Dataset']

model = cat.CatBoostClassifier(iterations=100, learning_rate=0.1, random_strength=6, verbose=0)
@@ -41,7 +40,7 @@ def ml_predict(df_rules, df_dataset_features, run_path):


fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
-    plot_with_lines_and_predictions(X_pca, X_pca_test, y_train, "Current Dataset", y_pred, train_dataset_names, ax, 'ML Predictions', encoder)
+    plot_with_lines_and_predictions(X_umap, X_umap_test, y_train, "Current Dataset", y_pred, train_dataset_names, ax, 'ML Predictions', encoder)
plt.savefig(run_path / "Prediction_ml.png")
return encoder.inverse_transform(y_pred.ravel())[0]
