-
Notifications
You must be signed in to change notification settings - Fork 1
/
data-preprocessing-hccho.txt
126 lines (78 loc) · 3.9 KB
/
data-preprocessing-hccho.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
https://scikit-learn.org/stable/modules/preprocessing.html
"""
############################################################
#
import numpy as np  # np.nan is used below; nothing earlier in the file imports NumPy

# Toy product table used by the snippets in this file. The 'product' column
# deliberately contains missing values (np.nan) in its first and last rows.
data = {
    'type': ['softdrink', 'juice', 'softdrink', 'softdrink', 'juice', 'juice', 'juice'],
    'product': [np.nan, 'orange', 'pepsi', 'pepsi', 'orange', 'grape', np.nan],
    'price': [25, 94, 57, 62, 70, 50, 60],
}
############################################################
# Quick inspection of a DataFrame.
# NOTE(review): `df` is not defined above this point and 'col_name' looks like
# a placeholder column name -- presumably both are supplied by the reader.
df.info()  # summary of the DataFrame
df['col_name'].value_counts() # breakdown of a categorical column: occurrences of each distinct value
df.isnull().sum() # null information: number of null entries per column
df.head(10) # the first few rows (here 10)
############################################################
# Feature scaling with scikit-learn: each scaler is fitted on x, and the
# statistics learned from x are then reused to transform y.
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,RobustScaler

x = np.arange(20).reshape(5, 4).astype(np.float32)
y = np.arange(16).reshape(4, 4).astype(np.float32)
print(np.mean(x, axis=0))  # per-column mean of x before scaling

# 1: StandardScaler, 2: MinMaxScaler, 3: MaxAbsScaler, 4: RobustScaler
scaler_type = 4
if scaler_type == 1:
    scaler = StandardScaler()
    x = scaler.fit_transform(x)  # (x - m) / s
    print("params", scaler.mean_, scaler.scale_, scaler.var_, scaler.n_samples_seen_)
    y = scaler.transform(y)  # (y - m) / s, using the m and s fitted on x
elif scaler_type == 2:
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    y = scaler.transform(y)
    print(x, y)
elif scaler_type == 3:
    scaler = MaxAbsScaler()
    x = scaler.fit_transform(x)
    y = scaler.transform(y)
    print(x, y)
elif scaler_type == 4:
    # This branch used to repeat MaxAbsScaler (copy-paste of case 3) while the
    # imported RobustScaler was never used; case 4 now exercises RobustScaler.
    scaler = RobustScaler()
    x = scaler.fit_transform(x)
    y = scaler.transform(y)
    print(x, y)
############################################################
# One-hot encoding with pandas
import pandas as pd
x = np.arange(20).reshape(5, 4)
df = pd.DataFrame({
    'sex': ['male', 'female', 'female', 'male'],
    'age': [22, 23, 53, 13],
    'birth': [2018, 2001, 1985, 1971],
    'from': ['seoul', 'pusan', 'seoul', 'jinju'],
})
# get_dummies: convert categorical variables into dummy/indicator variables.
dg = pd.get_dummies(df)                    # called on the whole frame, no column list
dg2 = pd.get_dummies(df['sex'])            # called on a single column (Series)
dg3 = pd.get_dummies(df, columns=['sex'])  # restricted to the 'sex' column
# Type conversion: 'age' is cast to str and then to int32.
dg3['age'] = dg3['age'].astype(str).astype(np.int32)
############################################################
# DataFrame concat (row-wise): stack s2 below s1.
s1 = pd.DataFrame([0, 1], index=['A', 'B'])
s2 = pd.DataFrame([2, 3, 4], index=['A', 'B', 'C'])
s = pd.concat([s1, s2])
ss = s.reset_index(drop=True) # with drop=True the existing index is discarded and a new index is created.
#sss = ss.drop(columns=['index']) # if drop=True was not given, remove the resulting 'index' column.
############################################################
data: the first column holds a serial number, and the fields are separated by tabs.
1 I didn't know it was police housing," officers quoted Tsuchida as saying.
2 You would be a great client for Southern Indiana Homeownership's credit counseling but you are saying to yourself "Oh, we can pay that off."
3 He believes the 21st century will be the "century of biology" just as the 20th century was the century of IT.
def preprocess_hccho(input_file='D:/NLP/Datasets/eng_news_2005_300K/eng_news_2005_300K-sentences.txt',
                     output_file='D:/NLP/Datasets/eng_news_2005_300K/processed.txt'):
    """Strip quote characters from a tab-separated sentence file.

    The input is read as headerless tab-separated text. Only the second
    column is kept; every double quote, single quote and backtick is removed
    from it, and the result is written to ``output_file`` one value per line.
    The first ten rows are printed before and after the clean-up.

    Args:
        input_file: Path of the tab-separated input file. Defaults to the
            location that was previously hard-coded.
        output_file: Path the cleaned sentences are written to. Defaults to
            the location that was previously hard-coded.
    """
    import csv
    import re

    import pandas as pd

    quote_chars = re.compile(r'''["`']''')  # alternative: r'[^\w\s]'

    data = pd.read_csv(
        input_file,
        sep="\t",
        header=None,
        # Quotes are ordinary text here. With the default quoting, a field
        # that starts with an unmatched '"' swallows the following lines.
        quoting=csv.QUOTE_NONE,
        # Keep every field as text so values such as "NA" or a purely
        # numeric field still reach the regex as strings.
        dtype=str,
        keep_default_na=False,
    )
    print(data.head(10))

    # Column 1 is the second field; column 0 is dropped from the output.
    data = data[1].apply(lambda text: quote_chars.sub('', text))
    data.to_csv(output_file, index=False, header=False, sep='\t')
    print(data.head(10))
    print('Done')
############################################################
############################################################
############################################################
############################################################
############################################################
############################################################