-
Notifications
You must be signed in to change notification settings - Fork 0
/
classbalance.py
33 lines (25 loc) · 1.06 KB
/
classbalance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
# Balance the 'Rating' classes of the review dataset and save the result.
#
# Pipeline: read the training CSV, bring every rating class to one common
# size by under-sampling the larger classes and over-sampling the smaller
# ones, then write the balanced rows to a new CSV.

# Load the data
data = pd.read_csv('train (1).csv')

# Separate features and labels.  The samplers expect a 2D feature matrix, so
# the single 'Review' column is reshaped to (n_samples, 1).
X = data['Review'].values.reshape(-1, 1)
y = data['Rating']

# Print class distribution before balancing
class_counts = y.value_counts()
print('Class distribution before balancing:')
print(class_counts)

# Pick one per-class target size that sits between the smallest and largest
# class.  With the samplers' default strategies the under-sampler alone
# already shrinks every class to the minority size, which leaves nothing for
# the over-sampler to do; explicit per-class targets make both steps count.
target_size = int(class_counts.median())
under_strategy = {
    label: target_size
    for label, count in class_counts.items()
    if count > target_size
}
over_strategy = {
    label: target_size
    for label, count in class_counts.items()
    if count < target_size
}

X_resampled, y_resampled = X, y

# Undersample the classes that are larger than the target size.
# Skipped when no class exceeds the target (e.g. data already balanced).
if under_strategy:
    rus = RandomUnderSampler(sampling_strategy=under_strategy, random_state=0)
    X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

# Oversample the classes that are smaller than the target size.
if over_strategy:
    ros = RandomOverSampler(sampling_strategy=over_strategy, random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X_resampled, y_resampled)

# Print class distribution after balancing
print('Class distribution after balancing:')
print(y_resampled.value_counts())

# Create a new DataFrame with the resampled data; ravel() flattens the
# (n_samples, 1) feature matrix back into a single 'Review' column.
resampled_data = pd.DataFrame({'Review': X_resampled.ravel(), 'Rating': y_resampled})

# Save the resampled data to a new CSV file
resampled_data.to_csv('balanced_train.csv', index=False)