-
Notifications
You must be signed in to change notification settings - Fork 0
/
Text_Cleaning_Uni_Data.py
56 lines (46 loc) · 2.38 KB
/
Text_Cleaning_Uni_Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from cleantext import clean
import os
import re
import json
import pandas as pd
import sys
import numpy
# Module-level side effect: raise NumPy's print threshold to sys.maxsize so
# that printed arrays are shown in full rather than summarised with "...".
# NOTE(review): nothing in the visible part of this file prints a NumPy
# array, so this looks like a debugging aid -- confirm before removing.
numpy.set_printoptions(threshold=sys.maxsize)
# Shortest run from an opening bracket to the next closing bracket of any
# kind, e.g. "(CS101)" or "[3 credits]". Raw string so the backslashes reach
# the regex engine instead of being treated as (invalid) string escapes.
_BRACKETED_SPAN = re.compile(r"[(\[{].*?[)\]}]")


def clean_data(text):
    '''
    Clean English text scraped from university websites.

    input: string - text:
        The raw text. Non-string values (e.g. the None/NaN that pandas uses
        for a missing cell) are handed to cleantext.clean() unchanged.

    Steps, in order:
      1. Drop anything enclosed in (), [] or {} together with the brackets.
         The match does not span line breaks.
      2. Run cleantext.clean(): fix unicode issues, transliterate to ASCII,
         remove line breaks and punctuation, and replace URLs, e-mail
         addresses and phone numbers with the placeholders "<URL>",
         "<EMAIL>" and "<PHONENO>". Case, numbers, digits and currency
         symbols are left as they are.
      3. Remove any remaining ";" characters.

    output: String - text:
        The cleaned text.
    '''
    # Bracketed spans must be removed BEFORE clean(): with no_punct=True the
    # brackets themselves are stripped as punctuation, after which there is
    # nothing left for the pattern to anchor on and the enclosed text stays.
    if isinstance(text, str):
        text = _BRACKETED_SPAN.sub("", text)

    text = clean(
        text=text,
        fix_unicode=True,
        to_ascii=True,
        lower=False,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=True,
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONENO>",
        lang="en",
    )

    # Kept from the original as a safety net for stray semicolons.
    return text.replace(";", "")
def get_data_and_cleaned():
    '''
    Build one cleaned, pipe-separated CSV from the scraped JSON files.

    Every file ending in ".json" inside the "ScrappedData" directory of the
    current working directory is loaded and its records are pooled into one
    table. The "Courses" column is then exploded so that each course of each
    program becomes an individual row, the fields of every course are spread
    into their own columns, and the "desc" and "ProgDesc" columns are passed
    through clean_data() to produce "clean_desc" and "clean_ProgDesc".
    The result is written to "data(1).csv" in the working directory.

    NOTE(review): assumes each JSON file holds a list of program records that
    carry "Courses" and "ProgDesc", and that each course carries "desc" --
    confirm against the scraper output.

    Input: None
    Output: None
    '''
    scraped_dir = os.path.join(os.getcwd(), "ScrappedData")

    # Pool the records of all universities into a single list.
    raw_uni_data = []
    # Sorted so the CSV row order does not depend on the arbitrary order in
    # which the file system lists the directory.
    for filename in sorted(os.listdir(scraped_dir)):
        if not filename.endswith(".json"):
            continue
        # Decode explicitly as UTF-8; without it open() falls back to the
        # platform locale and can mis-read or reject non-ASCII text.
        with open(os.path.join(scraped_dir, filename), 'r', encoding='utf-8') as f:
            raw_uni_data.extend(json.load(f))
    raw_uni_data = pd.DataFrame(raw_uni_data)

    # One row per (program, course); the fresh 0..n-1 index lets the course
    # columns built below line up with these rows in the concat.
    data = raw_uni_data.explode('Courses').reset_index(drop=True)
    # Rows without a course come out of explode() as NaN; use an empty dict
    # for them so they still yield a row (with empty course columns).
    course_records = [
        course if isinstance(course, dict) else {}
        for course in data['Courses']
    ]
    data1 = pd.concat(
        [data.drop('Courses', axis=1), pd.DataFrame(course_records)],
        axis=1,
    )

    data1['clean_desc'] = data1['desc'].apply(clean_data)
    data1['clean_ProgDesc'] = data1['ProgDesc'].apply(clean_data)

    # This writes the sample output only; the actual data is provided in the
    # folder above under the name data.csv.
    data1.to_csv("data(1).csv", sep='|', encoding='utf-8')