Skip to content

Commit

Permalink
optimised geo_filter.py
Browse files Browse the repository at this point in the history
  • Loading branch information
DE0CH committed Jun 22, 2020
1 parent 5533e58 commit 417732e
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
4 changes: 1 addition & 3 deletions geo_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ def process_files(path, file_name, geo_filtered_dict):
for line in lines:
if not line.strip():
continue
tweet = json.loads(line)
if tweet.get('place', None) is not None and tweet['place']['country_code']:
if '"country_code"' in line:
out_file.write(line)
logging.info('finished ' + os.path.join(path, file_name))
geo_filtered_dict[os.path.join(path, file_name)] = True
Expand Down Expand Up @@ -54,7 +53,6 @@ def process_files(path, file_name, geo_filtered_dict):
p = multiprocessing.Process(target=worker, args=(q, geo_filtered_dict))
processes.append(p)
p.start()
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'twitter-sentiment-analysis-f22ce784b0a8.json'
for path, dirs, files in os.walk('untarred'):
for file_name in files:
if not os.path.join(path, file_name) in geo_filtered_dict and file_name.endswith('.json.bz2'):
Expand Down
7 changes: 5 additions & 2 deletions notes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
"\n",
"45253479 characters for Feb and 637787 tweets for Feb. Meaning per message 70 characters\n",
"\n",
"6250000"
"6250000\n",
"\n",
"49.9 G of geo enabled data"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -54,7 +56,8 @@
"To upload from EC2\n",
"```\n",
"aws s3 sync processed s3://dtwa/processed\n",
"```"
"```\n",
"\n"
],
"metadata": {
"collapsed": false
Expand Down

0 comments on commit 417732e

Please sign in to comment.