data-preprocessing-hccho.txt

"""
https://scikit-learn.org/stable/modules/preprocessing.html
"""
############################################################
# 
# Toy beverage table: two categorical columns ('type', 'product') and one
# numeric column ('price'). 'product' is missing (NaN) in the first and last rows.
data = dict(
    type=['softdrink', 'juice', 'softdrink', 'softdrink', 'juice', 'juice', 'juice'],
    product=[np.nan, 'orange', 'pepsi', 'pepsi', 'orange', 'grape', np.nan],
    price=[25, 94, 57, 62, 70, 50, 60],
)



############################################################
# Quick-look calls for inspecting a DataFrame, written as bare expressions
# (their results are only displayed when run interactively).
# NOTE(review): assumes `df` is an already-loaded pandas DataFrame and that
# 'col_name' is a placeholder for a real column name — confirm before running.
df.info()
df['col_name'].value_counts()  # frequency of each distinct value in a categorical column
df.isnull().sum()  # number of null entries per column

df.head(10) # first 10 rows


############################################################

from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,RobustScaler

# Demo: fit a scaler on x, then reuse the statistics learned from x to
# transform a second array y (the usual train/test pattern).
x = np.arange(20).reshape(5,4).astype(np.float32)
y =  np.arange(16).reshape(4,4).astype(np.float32)
print(np.mean(x,axis=0))  # per-column mean of x before scaling


# Selects which scaler to demonstrate:
#   1 = StandardScaler, 2 = MinMaxScaler, 3 = MaxAbsScaler, 4 = RobustScaler
scaler_type=4
if scaler_type==1:
    scaler = StandardScaler()
    x = scaler.fit_transform( x )   # (x-m)/s
    print("params", scaler.mean_, scaler.scale_,scaler.var_,scaler.n_samples_seen_)

    y = scaler.transform( y )   # (y-m)/s, using the m and s that were fitted on x
elif scaler_type==2:
    scaler = MinMaxScaler()
    x = scaler.fit_transform( x )
    y = scaler.transform( y )   # reuses the min/max fitted on x
    print(x,y)
elif scaler_type==3:
    scaler = MaxAbsScaler()
    x = scaler.fit_transform( x )
    y = scaler.transform( y )   # reuses the max-abs values fitted on x
    print(x,y)
elif scaler_type==4:
    # Fixed: this branch used to repeat MaxAbsScaler (copy of branch 3), which
    # left the imported RobustScaler unused and never demonstrated.
    scaler = RobustScaler()
    x = scaler.fit_transform( x )
    y = scaler.transform( y )   # reuses the centering/scaling fitted on x
    print(x,y)




############################################################

# One-hot encoding with pandas.
import pandas as pd
x = np.arange(20).reshape((5, 4))

df = pd.DataFrame({
    'sex': ['male', 'female', 'female', 'male'],
    'age': [22, 23, 53, 13],
    'birth': [2018, 2001, 1985, 1971],
    'from': ['seoul', 'pusan', 'seoul', 'jinju'],
})
# get_dummies: convert categorical variables into dummy/indicator columns.
dg = pd.get_dummies(df)                     # whole frame: 'sex' and 'from' are expanded
dg2 = pd.get_dummies(df.sex)                # a single column -> one indicator column per category
dg3 = pd.get_dummies(df, columns=['sex'])   # only 'sex' is expanded; 'from' is left as-is

# Type conversion: round-trip 'age' through str and back to a 32-bit integer.
dg3['age'] = dg3['age'].astype(str).astype(np.int32)

############################################################
# Row-wise concatenation of two DataFrames.

s1 = pd.DataFrame([0, 1], index=list('AB'))
s2 = pd.DataFrame([2, 3, 4], index=list('ABC'))
# Stacking keeps each frame's own labels, so 'A' and 'B' appear twice in s.
s = pd.concat((s1, s2), axis=0)
# drop=True throws the old labels away and renumbers the rows from 0.
# Without drop=True the old labels are kept as an 'index' column, which could
# then be removed with ss.drop(columns=['index']).
ss = s.reset_index(drop=True)
############################################################

data: the first column holds a serial number, and the fields are tab-separated.
1	I didn't know it was police housing," officers quoted Tsuchida as saying.
2	You would be a great client for Southern Indiana Homeownership's credit counseling but you are saying to yourself "Oh, we can pay that off."
3	He believes the 21st century will be the "century of biology" just as the 20th century was the century of IT.

def preprocess_hccho(input_file='D:/NLP/Datasets/eng_news_2005_300K/eng_news_2005_300K-sentences.txt',
                     output_file='D:/NLP/Datasets/eng_news_2005_300K/processed.txt'):
    """Strip quote characters from a tab-separated sentence file.

    The input has no header row; column 0 is a serial number and column 1 is
    the sentence. Every double quote, single quote and backtick is removed
    from the sentence column, and the cleaned sentences alone (no serial
    number, no header) are written to ``output_file``, one per line.

    Args:
        input_file: Path of the tab-separated source file.
        output_file: Path the cleaned sentences are written to.

    Returns:
        None. The first 10 rows before and after cleaning are printed.
    """
    import csv
    import pandas as pd

    # QUOTE_NONE: the sentences contain stray, unbalanced double quotes. With
    # the default quoting, a field that starts with '"' is read as a quoted
    # field and swallows the following rows until another quote appears.
    data = pd.read_csv(input_file, sep="\t", header=None, quoting=csv.QUOTE_NONE)

    print(data.head(10))

    # Vectorised regex replace; unlike re.sub inside apply(), it needs no `re`
    # import and leaves missing values alone instead of raising TypeError.
    data = data[1].str.replace(r'''["`']''', '', regex=True)   # r'[^\w\s]'

    data.to_csv(output_file, index=False, header=False, sep='\t')

    print(data.head(10))

    print('Done')


############################################################

############################################################


############################################################

############################################################




############################################################

############################################################