Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSV Converter for Python 3 #33

Open
mirzamo opened this issue Feb 8, 2017 · 1 comment
Open

CSV Converter for Python 3 #33

mirzamo opened this issue Feb 8, 2017 · 1 comment

Comments

@mirzamo
Copy link

mirzamo commented Feb 8, 2017

The current version of the converter is not compatible with Python 3. Here is the fixed version:

`# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.

For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge

"""
import argparse
import collections
import csv
import json

def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Convert a line-delimited JSON file into a CSV file.

    The first CSV row is the header (the column names); every following row
    holds the values of one JSON record, in the same column order.

    Args:
        json_file_path: Path of the input file, one JSON object per line.
        csv_file_path: Path of the CSV file to create (overwritten if present).
        column_names: Iterable of flattened column names (e.g. ``'a.b'``).
    """
    # Freeze the order once so the header and every data row agree, even when
    # the caller hands us an unordered set or a one-shot iterator.
    columns = list(column_names)
    # newline='' is required by the csv module so that it controls line
    # endings itself (otherwise Windows output gets blank rows). Explicit
    # UTF-8 keeps the conversion independent of the platform's locale.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout)
        writer.writerow(columns)
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                writer.writerow(get_row(json.loads(line), columns))

def get_superset_of_column_names_from_file(json_file_path):
    """Return the set of every flattened column name found in the file.

    Args:
        json_file_path: Path of the input file, one JSON object per line.

    Returns:
        A set with the union of the flattened key names of all records.
    """
    column_names = set()
    # Explicit UTF-8 so reading does not depend on the platform's locale.
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at EOF).
            if not line.strip():
                continue
            # Iterating the returned dict yields its keys, i.e. the names.
            column_names.update(get_column_names(json.loads(line)))
    return column_names

def get_column_names(line_contents, parent_key=''):
    """Return a dict mapping flattened key names to their leaf values.

    Nested mappings are flattened by joining the keys with a dot.

    Example:

        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }

        will return: {'a.b': 2, 'a.c': 3}

    The keys of the result are the column names for the eventual csv file.

    Args:
        line_contents: The (possibly nested) mapping to flatten.
        parent_key: Dotted prefix of the enclosing keys; empty at top level.
    """
    # collections.MutableMapping was removed in Python 3.10; the ABC lives in
    # collections.abc. Imported locally so the block does not rely on the
    # submodule having been loaded by a bare ``import collections``.
    from collections.abc import MutableMapping

    flattened = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse into nested mappings, carrying the dotted prefix along.
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened

def get_nested_value(d, key):
    """Return the value stored in d under a flattened (dotted) key.

    Example:

        d = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        key = 'a.b'

        will return: 2

    Args:
        d: The (possibly nested) mapping to look into.
        key: A flattened key as produced by get_column_names.

    Returns:
        The value found, or None when any component of the key is missing
        or when the path runs through a value that is not a mapping.
    """
    from collections.abc import Mapping

    current = d
    for part in key.split('.'):
        # A record may hold None or a scalar where other records hold a
        # nested dict; treat that as "no value" instead of raising TypeError
        # (or, for strings, doing an accidental substring test).
        if not isinstance(current, Mapping) or part not in current:
            return None
        current = current[part]
    return current

def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Args:
        line_contents: One decoded JSON record (a possibly nested dict).
        column_names: Iterable of flattened column names, in output order.

    Returns:
        A list of strings, one per column; missing or None values become ''.
    """
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)
        if line_value is None:
            row.append('')
        else:
            # Python 3 strings are already text: encoding them to bytes and
            # formatting the result would write the repr "b'...'" into the
            # CSV. Encoding is left to the output file object instead.
            row.append('{0}'.format(line_value))
    return row

# Script entry point: convert a yelp dataset file from json to csv.
# The guard must compare the dunder names; a bare ``name == 'main'`` raises
# NameError as soon as the module is executed.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
    )

    parser.add_argument(
        'json_file',
        type=str,
        help='The json file to convert.',
    )

    args = parser.parse_args()

    json_file = args.json_file
    # The output sits next to the input: everything before the first '.json'
    # in the path, with '.csv' appended.
    csv_file = '{0}.csv'.format(json_file.split('.json')[0])

    # Two passes over the input: first collect every column name that occurs
    # in any record, then write the rows using that full set of columns.
    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)
@oribarel
Copy link

If you want this code to work properly, you also need to add the argument encoding="utf8" when opening the JSON file, i.e. open(json_file_path, encoding="utf8").

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants