forked from ypochien/TaiwanStockBSR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbatch_cp950_utf8.py
38 lines (33 loc) · 875 Bytes
/
batch_cp950_utf8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
import os
import csv
import re
import pandas as pd
from pandas import Series, DataFrame
import chardet
# Force-decode a single byte string by trying each candidate codec in turn.
def force_decode(string, codecs=('utf8', 'cp950')):
    """Decode ``string`` with the first codec in ``codecs`` that succeeds.

    Args:
        string: Encoded byte string to decode (any object with ``.decode``).
        codecs: Codec names to try, in order. Defaults to UTF-8, then CP950.

    Returns:
        The decoded text produced by the first codec that works, or the
        literal string ``'none'`` when every codec fails (or ``codecs`` is
        empty).
    """
    for codec in codecs:
        try:
            return string.decode(codec)
        except (UnicodeError, LookupError, AttributeError):
            # UnicodeError: the bytes are not valid in this codec.
            # LookupError: the codec name is unknown.
            # AttributeError: the object has no .decode (e.g. already text).
            # Keep the best-effort contract and move on to the next codec,
            # but no longer swallow KeyboardInterrupt/SystemExit.
            continue
    return 'none'
# Not used by the conversion loop below; kept so the module still exposes it.
siteDict = {}

# Encodings (lower-cased chardet names) whose bytes are already valid UTF-8,
# so the file needs no conversion. ASCII is a strict subset of UTF-8.
_UTF8_COMPATIBLE = ('utf-8', 'utf-8-sig', 'ascii')

# Walk every file under the BSR directory and rewrite, in place, each CSV
# that is not already UTF-8 by reading it as cp950 and saving it as UTF-8.
for dirPath, dirNames, fileNames in os.walk('BSR'):
    print('#file:' + str(len(fileNames)))
    for f in fileNames:
        # Skip dot-files (hidden files).
        if f.startswith('.'):
            continue
        fullfile = os.path.join(dirPath, f)

        # Sniff the encoding from the raw bytes. Binary mode avoids newline
        # translation and hands chardet the exact on-disk content; the
        # context manager closes the handle before pandas reopens the file.
        with open(fullfile, 'rb') as fh:
            rawdata = fh.read()
        if not rawdata:
            # Nothing to convert, and pd.read_csv raises on an empty file.
            continue

        # chardet reports None when it cannot guess, and the case of the
        # reported name is not guaranteed, so compare case-insensitively.
        charenc = chardet.detect(rawdata)['encoding']
        if charenc is not None and charenc.lower() in _UTF8_COMPATIBLE:
            continue

        # Decode the cp950 CSV with pandas and write it back as UTF-8.
        # NOTE(review): read_csv infers column dtypes, so values such as
        # codes with leading zeros may be altered by the round trip - confirm
        # whether dtype=str is wanted here.
        df = pd.read_csv(fullfile, encoding='cp950')
        df.to_csv(fullfile, encoding='utf-8', index=False)
        print('re-file @' + f)