Skip to content

Commit

Permalink
Merge pull request #87 from kookmin-sw/develop
Browse files Browse the repository at this point in the history
20230519 Master 브랜치 병합
  • Loading branch information
young43 authored May 19, 2023
2 parents e7c46f4 + 6976aa5 commit 6daca42
Show file tree
Hide file tree
Showing 135 changed files with 3,169 additions and 17,586 deletions.
377 changes: 313 additions & 64 deletions DM/app.py

Large diffs are not rendered by default.

30 changes: 23 additions & 7 deletions DM/csv_to_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@
import pandas as pd
from collections import Counter
from datetime import datetime
import pickle

from konlpy.tag import Komoran, Okt, Mecab
from database import Database
import platform

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity


#데이터 전처리 함수
def preprocessing(review):
Expand Down Expand Up @@ -55,6 +60,8 @@ def preprocessing(review):
# Database에서 country id 가져오기
db = Database()

country_word_list = []

for file in file_list:
country_name = file.split('_')[1]
res = db.select(f'select id from country where name="{country_name}"')
Expand All @@ -64,6 +71,7 @@ def preprocessing(review):
country_id = res[0][0]

# pandas csv 파일 읽기
# A lineterminator parameter was to be added because of a buffer-overflow error; NOTE(review): the read_csv call below does not pass it — confirm
data = pd.read_csv(crawl_path + file)
word_set = []

Expand All @@ -76,24 +84,32 @@ def preprocessing(review):
continue
print(e)


# print(wc)
wc = dict(Counter(word_set).most_common())

wc = dict(filter(lambda x:x[1] > 10, wc.items())) # 10번 이상 들어간 값만 추출
# print(wc)
# print(f"{country_id}_{country_name} : LENGTH={len(str(wc))}")
# print("="*50)
country_word_list.append(" ".join(word_set)) # pickle로 저장할 데이터

print(f"{country_id}_{country_name} : LENGTH={len(str(wc))}")
print("=" * 50)

# Database 데이터 insert (값이 있으면 UPDATE)
cur_time = datetime.today().strftime("%Y/%m/%d %H:%M:%S")
query = f'INSERT INTO country_data VALUES({country_id}, "{str(wc)}", now())' \
f'ON DUPLICATE KEY UPDATE id="{country_id}", contents="{str(wc)}", upload_time=now();'

db.query(query)
# db.query(query)

db.close()


# Build TF-IDF vectors from the collected word lists (the cosine-similarity matrix derived from them is what gets pickled)
vectorizer = TfidfVectorizer(max_features=500) # 상위 500단어 추출
tfidf_matrix = vectorizer.fit_transform(country_word_list)

db.close()
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim = np.array(cosine_sim)

# Save the cosine-similarity matrix to data.pickle
with open('data.pickle', 'wb') as f:
pickle.dump(cosine_sim, f)
print('data.pickle 저장 완료')
Binary file added DM/data.pickle
Binary file not shown.
113 changes: 76 additions & 37 deletions DM/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,42 +1,81 @@
aniso8601==9.0.1
asn1crypto==0.24.0
attrs==22.2.0
beautifulsoup4==4.12.0
click==8.0.4
cryptography==2.1.4
dataclasses==0.8
Flask==2.0.3
attrs==19.3.0
Automat==0.8.0
blinker==1.6.2
certifi==2019.11.28
chardet==3.0.4
click==8.1.3
cloud-init==23.1.2
colorama==0.4.3
command-not-found==0.3
configobj==5.0.6
constantly==15.1.0
cryptography==2.8
dbus-python==1.2.16
distro==1.4.0
distro-info===0.23ubuntu1
ec2-hibinit-agent==1.0.0
entrypoints==0.3
Flask==2.3.1
Flask-Cors==3.0.10
flask-restx==1.1.0
idna==2.6
importlib-metadata==4.8.3
itsdangerous==2.0.1
Jinja2==3.0.3
joblib==1.1.1
JPype1==1.3.0
JPype1-py3==0.5.5.4
jsonschema==4.0.0
keyring==10.6.0
keyrings.alt==3.0
konlpy==0.6.0
lxml==4.9.2
MarkupSafe==2.0.1
mecab-python===0.996-ko-0.9.2
numpy==1.19.5
pandas==1.1.5
pycrypto==2.6.1
pygobject==3.26.1
PyMySQL==1.0.2
pyrsistent==0.18.0
python-dateutil==2.8.2
python-dotenv==0.20.0
hibagent==1.0.1
httplib2==0.14.0
hyperlink==19.0.0
idna==2.8
importlib-metadata==6.6.0
incremental==16.10.1
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.2.0
jsonpatch==1.22
jsonpointer==2.0
jsonschema==3.2.0
keyring==18.0.1
language-selector==0.1
launchpadlib==1.10.13
lazr.restfulclient==0.14.2
lazr.uri==1.0.3
MarkupSafe==2.1.2
more-itertools==4.2.0
netifaces==0.10.4
numpy==1.24.3
oauthlib==3.1.0
pexpect==4.6.0
pyasn1==0.4.2
pyasn1-modules==0.2.1
PyGObject==3.36.0
PyHamcrest==1.9.0
PyJWT==1.7.1
pymacaroons==0.13.0
PyMySQL==1.0.3
PyNaCl==1.3.0
pyOpenSSL==19.0.0
pyrsistent==0.15.5
pyserial==3.4
python-apt==2.0.1+ubuntu0.20.4.1
python-debian===0.1.36ubuntu1
python-dotenv==1.0.0
pytz==2023.3
pyxdg==0.25
scikit-learn==0.24.2
scipy==1.5.4
PyYAML==5.3.1
requests==2.22.0
requests-unixsocket==0.2.0
scikit-learn==1.2.2
scipy==1.10.1
SecretStorage==2.3.1
six==1.16.0
soupsieve==2.3.2.post1
service-identity==18.1.0
simplejson==3.16.0
six==1.14.0
sos==4.4
ssh-import-id==5.10
systemd-python==234
threadpoolctl==3.1.0
typing-extensions==4.1.1
Werkzeug==2.0.3
zipp==3.6.0
Twisted==18.9.0
ubuntu-advantage-tools==8001
ufw==0.36
unattended-upgrades==0.1
urllib3==1.25.8
wadllib==1.3.3
Werkzeug==2.3.2
zipp==1.0.0
zope.interface==4.7.1
Loading

0 comments on commit 6daca42

Please sign in to comment.