# -*- coding: utf-8 -*-
"""image_augmentation_ben_pc.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1iHvkV1IOUOCNQH2MeZLeq6bX4--PZkXA
"""
# Commented out IPython magic to ensure Python compatibility.
# importing all the required libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import skimage.io as io
from skimage.transform import rotate, AffineTransform, warp
import os
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
path = '' #path to the data folder containing the scenario image folders (augmented images are saved here too)
#dir_path_0 = r'/content/drive/MyDrive/Hackerthon_data/0_img' #path to the image folder for scenario 0
#dir_path_1 = r'/content/drive/MyDrive/Hackerthon_data/1_img' #path to the image folder for scenario 1
#dir_path_2 = r'/content/drive/MyDrive/Hackerthon_data/2_img' #path to the image folder for scenario 2
# mount Google Drive so the training CSV and the scenario image folders can be read
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/Hackerthon_data/train.csv')
df['File_ID'] = df.example_path.str.extract(r'(\d+)')  # numeric file ID taken from the example path
df['version_no'] = np.zeros([len(df)])                 # augmentation version counter; 0 marks the original image
# split the training data into one dataframe per scenario label
df_0 = df[df['label'] == 0].copy().reset_index()
df_1 = df[df['label'] == 1].copy().reset_index()
df_2 = df[df['label'] == 2].copy().reset_index()
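# quick sanity check (a small addition, not from the original notebook): look at the starting
# class balance that percentage_matcher below is meant to even out
print(df['label'].value_counts(normalize=True))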
def percentage_matcher(n_iter, df, df_0, df_1, df_2, path):
    '''Augment the under-represented scenarios so that the percentage of files in each
    scenario becomes similar (to 2 d.p.).
    input n_iter: number of iterations of the augmentation loop
    input df: dataframe of all training data
    input df_0: dataframe of scenario 0
    input df_1: dataframe of scenario 1
    input df_2: dataframe of scenario 2
    input path: path to the folder holding the scenario image folders and the saved
                augmented images, e.g. /content/drive/MyDrive/Hackerthon_data/
    return lengths: the number of images in each scenario
    return percentages: the percentage of events in each scenario
    return df: dataframe of augmented training data'''
    len_0 = len(df_0)  #(len([entry for entry in os.listdir(dir_path_0) if os.path.isfile(os.path.join(dir_path_0, entry))])) #number of scenario 0 images
    len_1 = len(df_1)  #(len([entry for entry in os.listdir(dir_path_1) if os.path.isfile(os.path.join(dir_path_1, entry))])) #number of scenario 1 images
    len_2 = len(df_2)  #(len([entry for entry in os.listdir(dir_path_2) if os.path.isfile(os.path.join(dir_path_2, entry))])) #number of scenario 2 images
    lengths = np.array([len_0, len_1, len_2])  #put the starting number of files for each scenario into an array
    percentages = lengths/np.sum(lengths)      #starting percentage of images in each of the three scenarios
    folders = np.array(['0_img', '1_img', '2_img'])
    for i in range(n_iter):
        min_length = min(lengths)
        boolarr = lengths == min_length           #identify the scenario(s) with the fewest events
        lengths[boolarr] = lengths[boolarr] + 5   #each pass adds 5 augmented images (3 rotations + 2 flips)
        percentages = lengths/np.sum(lengths)
        scenario = folders[boolarr][0]            #augment the first scenario with the fewest events
        if scenario == '0_img':
            sample_img_no = np.array(df_0.sample())
        elif scenario == '1_img':
            sample_img_no = np.array(df_1.sample())
        elif scenario == '2_img':
            sample_img_no = np.array(df_2.sample())
        image_path = '{h}/{k}/{p}.png'.format(h=path, k=scenario, p=sample_img_no[0][-2])
        img = io.imread(image_path)
        io.imshow(img)
        rotations = [90, 180, 270]
        for j in range(len(rotations)):
            rotated = rotate(img, angle=rotations[j], mode='wrap')
            sample_img_no[0][-1] += 1   #bump the version number for this augmented copy
            sample_img_no_version = sample_img_no[0][1:]
            df_ev = pd.DataFrame([sample_img_no_version], columns=['label','latitude','longitude','year','example_path','File_ID','version_no'])
            df_ev['example_path'] = 'train_test_data/train/{p}_{q}.png'.format(p=df_ev['File_ID'].iloc[-1], q=int(df_ev['version_no'].iloc[-1]))
            df = pd.concat([df, df_ev], ignore_index=True)
            io.imsave('{h}/{k}/{p}_{q}.png'.format(h=path, k=scenario, p=df['File_ID'].iloc[-1], q=int(df['version_no'].iloc[-1])), arr=rotated)
        flipLR = np.fliplr(img)
        sample_img_no[0][-1] += 1
        sample_img_no_version = sample_img_no[0][1:]
        df_ev = pd.DataFrame([sample_img_no_version], columns=['label','latitude','longitude','year','example_path','File_ID','version_no'])
        df_ev['example_path'] = 'train_test_data/train/{p}_{q}.png'.format(p=df_ev['File_ID'].iloc[-1], q=int(df_ev['version_no'].iloc[-1]))
        df = pd.concat([df, df_ev], ignore_index=True)
        io.imsave('{h}/{k}/{p}_{q}.png'.format(h=path, k=scenario, p=df['File_ID'].iloc[-1], q=int(df['version_no'].iloc[-1])), arr=flipLR)
        flipUD = np.flipud(img)
        sample_img_no[0][-1] += 1
        sample_img_no_version = sample_img_no[0][1:]
        df_ev = pd.DataFrame([sample_img_no_version], columns=['label','latitude','longitude','year','example_path','File_ID','version_no'])
        df_ev['example_path'] = 'train_test_data/train/{p}_{q}.png'.format(p=df_ev['File_ID'].iloc[-1], q=int(df_ev['version_no'].iloc[-1]))
        df = pd.concat([df, df_ev], ignore_index=True)
        io.imsave('{h}/{k}/{p}_{q}.png'.format(h=path, k=scenario, p=df['File_ID'].iloc[-1], q=int(df['version_no'].iloc[-1])), arr=flipUD)
    return lengths, percentages, df
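# Illustrative preview (a small addition, not in the original notebook): show the five augmented
# copies (three rotations and two flips) that percentage_matcher writes for each sampled image.
# The example file name below is hypothetical; the block only runs if such a file exists.
_preview_path = '{h}/0_img/1234.png'.format(h=path)
if os.path.isfile(_preview_path):
    _sample = io.imread(_preview_path)
    _previews = [rotate(_sample, angle=a, mode='wrap') for a in (90, 180, 270)]
    _previews += [np.fliplr(_sample), np.flipud(_sample)]
    _fig, _axes = plt.subplots(1, len(_previews), figsize=(15, 3))
    for _ax, _im in zip(_axes, _previews):
        _ax.imshow(_im)
        _ax.axis('off')
    plt.show()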
lengths, percentages, df = percentage_matcher(1, df, df_0, df_1, df_2, path)
df
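# Possible follow-up (an assumption, not in the original script): persist the augmented metadata
# table alongside the images so later training runs can reuse it; the CSV name is hypothetical.
df.to_csv(os.path.join(path, 'train_augmented.csv'), index=False)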