diff --git a/convert-to-trec.py b/convert-to-trec.py
index ddc0a16..5ee88e3 100644
--- a/convert-to-trec.py
+++ b/convert-to-trec.py
@@ -1,6 +1,7 @@
 import json
 from pprint import pprint
 import sys, getopt
+from gzip import GzipFile
 
 
 def usage():
@@ -8,6 +9,21 @@ def usage():
         " inputfile: the original JSONL file of the dataset\n\n"+
         "Copyright (c) 2015 by Singal Media Ltd.")
 
+def openfile(filepath, mode=None):
+    """Open filepath, transparently using GzipFile for names ending in '.gz'.
+
+    filepath -- path of the file to open.
+    mode -- optional mode string handed to GzipFile() or open(); when it is
+        None or empty, the opener is called without a mode argument.
+
+    Returns whatever GzipFile() or open() returns.
+    """
+    opener = GzipFile if filepath.endswith('.gz') else open
+    if not mode:
+        # No mode given: let the chosen opener apply its own default.
+        return opener(filepath)
+    return opener(filepath, mode)
+
 def main(argv):
     inputfile = ''
     outputfile = ''
@@ -30,8 +46,8 @@ def main(argv):
     if not outputfile:
         usage()
         sys.exit()
-    outfile = open(outputfile, 'w+')
-    with open(inputfile) as f:
+    outfile = openfile(outputfile, 'w+')
+    with openfile(inputfile) as f:
         for line in f:
             news_article=json.loads(line)
             #pprint(news_article)