import json
import os


def group_by_country(tweets):
    """Group tweets by their ``'country_code'`` value.

    Args:
        tweets: Iterable of dict-like tweets, each providing a
            ``'country_code'`` key.

    Returns:
        A dict mapping each country code to the list of tweets carrying
        that code. Codes appear in first-seen order and tweets keep their
        input order within each list. The tweet objects themselves are
        stored, not copies.

    Raises:
        KeyError: If a tweet has no ``'country_code'`` key.
    """
    countries = {}
    for tweet in tweets:
        # setdefault creates the bucket on first sight of a code, so the
        # key is looked up once instead of "check, insert, look up again".
        countries.setdefault(tweet['country_code'], []).append(tweet)
    return countries


if __name__ == '__main__':
    # Read the sentiment-annotated tweets, bucket them per country, and
    # write the resulting mapping back out as a single JSON object.
    with open('all_with_sentiments.json') as f:
        tweets = json.load(f)
    with open('all_by_countries.json', 'w') as f:
        json.dump(group_by_country(tweets), f)
";"}]], "Input", + CellChangeTimes->{{3.8018292554418917`*^9, 3.801829268194729*^9}}, + CellLabel->"In[3]:=",ExpressionUUID->"7e50b431-c814-43b3-8a70-47b3f510a34f"], + +Cell[BoxData[ + RowBox[{ + RowBox[{"tweets", "=", + RowBox[{"Import", "[", + RowBox[{"\"\\"", ",", "\"\\""}], "]"}]}], + ";"}]], "Input", + CellChangeTimes->{{3.801829228874784*^9, 3.801829254017221*^9}, { + 3.80182939434164*^9, 3.801829401467689*^9}, 3.8018304300215607`*^9, { + 3.801831083869722*^9, 3.801831084036481*^9}}, + CellLabel->"In[53]:=",ExpressionUUID->"8b5b0fcf-c0e1-4988-81e7-69b45f192640"], + +Cell[BoxData[ + RowBox[{ + RowBox[{"getSentiment", ":=", + RowBox[{ + RowBox[{"Classify", "[", + RowBox[{"\"\\"", ",", + RowBox[{"#", "[", + RowBox[{"[", "\"\\"", "]"}], "]"}], ",", + "\"\\""}], "]"}], "&"}]}], ";"}]], "Input", + CellChangeTimes->{{3.801829403348021*^9, 3.801829424402649*^9}, { + 3.80182948481728*^9, 3.8018295571106853`*^9}, {3.8018297704071703`*^9, + 3.8018297721097307`*^9}, {3.8018299109776363`*^9, 3.801829913584139*^9}, { + 3.801829960631764*^9, 3.801829969853791*^9}, {3.801830124089261*^9, + 3.801830125735715*^9}, 3.8018304526045523`*^9}, + CellLabel->"In[49]:=",ExpressionUUID->"97cdedea-0864-4034-b766-46b519d70639"], + +Cell[BoxData[ + RowBox[{ + RowBox[{"sentiments", "=", + RowBox[{"getSentiment", "/@", "tweets"}]}], ";"}]], "Input", + CellChangeTimes->{{3.80183014636773*^9, 3.801830180606152*^9}}, + CellLabel->"In[40]:=",ExpressionUUID->"ae7e2788-419f-4be6-9f2a-be1d0da9426b"], + +Cell[CellGroupData[{ + +Cell[BoxData[ + RowBox[{"Export", "[", + RowBox[{"\"\\"", ",", "sentiments"}], "]"}]], "Input", + CellChangeTimes->{{3.801830165287106*^9, 3.801830216629527*^9}, { + 3.801830318257956*^9, 3.80183033794354*^9}}, + CellLabel->"In[48]:=",ExpressionUUID->"0acc16de-62e8-484b-9e46-3d58283d42b4"], + +Cell[BoxData["\<\"sentiments.json\"\>"], "Output", + CellChangeTimes->{{3.8018301897362432`*^9, 3.80183021764944*^9}, { + 3.801830325411562*^9, 3.801830338261691*^9}}, + 
CellLabel->"Out[48]=",ExpressionUUID->"bfbbe786-961e-479c-a40b-22f2eeae4a3e"] +}, Open ]], + +Cell[BoxData[ + RowBox[{"sentiments", ";"}]], "Input", + CellChangeTimes->{{3.8018301987179413`*^9, 3.801830200109646*^9}, { + 3.8018302662198677`*^9, 3.801830269690198*^9}}, + CellLabel->"In[46]:=",ExpressionUUID->"43846aa9-a21d-4f9e-91a6-2976cfa0bd45"], + +Cell[CellGroupData[{ + +Cell[BoxData[ + RowBox[{"Length", "[", "tweets", "]"}]], "Input", + CellChangeTimes->{{3.801831146909433*^9, 3.80183115134906*^9}}, + CellLabel->"In[54]:=",ExpressionUUID->"625a7efa-f1f3-4427-8efb-08b980b00e07"], + +Cell[BoxData["5554059"], "Output", + CellChangeTimes->{3.801831151926058*^9}, + CellLabel->"Out[54]=",ExpressionUUID->"30e2dc34-92e9-40d3-82d7-7c3799d2128f"] +}, Open ]], + +Cell[BoxData[""], "Input", + CellChangeTimes->{{3.801830310465623*^9, 3.801830313016934*^9}, + 3.801831144429473*^9},ExpressionUUID->"4ed7d1d3-68aa-457a-a389-\ +ed58fde6ec2f"] +}, +WindowSize->{808, 911}, +WindowMargins->{{832, Automatic}, {Automatic, 289}}, +FrontEndVersion->"12.0 for Mac OS X x86 (64-bit) (April 11, 2019)", +StyleDefinitions->"Default.nb" +] +(* End of Notebook Content *) + +(* Internal cache information *) +(*CellTagsOutline +CellTagsIndex->{} +*) +(*CellTagsIndex +CellTagsIndex->{} +*) +(*NotebookFileOutline +Notebook[{ +Cell[561, 20, 269, 5, 30, "Input",ExpressionUUID->"7e50b431-c814-43b3-8a70-47b3f510a34f"], +Cell[833, 27, 431, 9, 30, "Input",ExpressionUUID->"8b5b0fcf-c0e1-4988-81e7-69b45f192640"], +Cell[1267, 38, 707, 14, 30, "Input",ExpressionUUID->"97cdedea-0864-4034-b766-46b519d70639"], +Cell[1977, 54, 261, 5, 30, "Input",ExpressionUUID->"ae7e2788-419f-4be6-9f2a-be1d0da9426b"], +Cell[CellGroupData[{ +Cell[2263, 63, 306, 5, 30, "Input",ExpressionUUID->"0acc16de-62e8-484b-9e46-3d58283d42b4"], +Cell[2572, 70, 245, 3, 34, "Output",ExpressionUUID->"bfbbe786-961e-479c-a40b-22f2eeae4a3e"] +}, Open ]], +Cell[2832, 76, 251, 4, 30, "Input",ExpressionUUID->"43846aa9-a21d-4f9e-91a6-2976cfa0bd45"], 
import json


def attach_sentiments(tweets, sentiments, progress=None):
    """Store ``sentiments[i]`` under ``tweets[i]['sentiment']`` for every i.

    The two sequences are paired purely by position, so they must have the
    same length; a mismatch means the inputs are out of step and every
    pairing after the gap would be wrong.

    Args:
        tweets: Sequence of mutable dict-like tweets. Modified in place.
        sentiments: Sequence of sentiment entries, one per tweet, in the
            same order as ``tweets``.
        progress: Optional callable that takes the iterable of
            ``(tweet, sentiment)`` pairs and returns an iterable yielding
            the same pairs (for example a progress-bar wrapper).

    Returns:
        The same ``tweets`` object, for convenience.

    Raises:
        ValueError: If ``tweets`` and ``sentiments`` differ in length.
            Raised before any tweet is modified.
    """
    if len(tweets) != len(sentiments):
        raise ValueError(
            'tweets and sentiments are misaligned: '
            f'{len(tweets)} tweets vs {len(sentiments)} sentiments'
        )
    pairs = zip(tweets, sentiments)
    if progress is not None:
        pairs = progress(pairs)
    for tweet, sentiment in pairs:
        tweet['sentiment'] = sentiment
    return tweets


if __name__ == '__main__':
    # Imported here so attach_sentiments() can be imported and tested
    # without the third-party progress bar package installed.
    import progressbar

    with open('all.json') as f:
        tweets = json.load(f)
    with open('sentiments.json') as f:
        sentiments = json.load(f)
    attach_sentiments(
        tweets,
        sentiments,
        # zip() has no len(), so the bar is told the total explicitly.
        progress=lambda pairs: progressbar.progressbar(
            pairs, max_value=len(tweets)),
    )
    with open('all_with_sentiments.json', 'w') as f:
        json.dump(tweets, f)
import json
import re

# Compiled once at import time: filter_text() runs once per tweet, so the
# patterns should not be looked up again on every call.
_NON_ASCII = re.compile(r'[^\x00-\x7F]')
_MENTION = re.compile(r'@[^\s]+')
_HASHTAG = re.compile(r'#[^\s]+')
# A link runs up to the next whitespace character. The class has to be
# [^\s] (non-whitespace): [^s] would mean "anything except the letter s",
# which leaves URLs such as https://status... untouched and lets other
# URLs swallow the words that follow them.
_HTTPS_LINK = re.compile(r'https://[^\s]+')
_HTTP_LINK = re.compile(r'http://[^\s]+')


def filter_text(s):
    """Strip non-ASCII characters, @mentions, #hashtags and links from ``s``.

    The removals are applied in that order. Each removed token is replaced
    by the empty string, so the whitespace that surrounded it is kept.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text.
    """
    # delete all the emoji (and any non ascii characters)
    s = _NON_ASCII.sub('', s)
    # remove all the @
    s = _MENTION.sub('', s)
    # remove all the #
    s = _HASHTAG.sub('', s)
    # remove all the links
    s = _HTTPS_LINK.sub('', s)
    s = _HTTP_LINK.sub('', s)
    return s


if __name__ == '__main__':
    # Imported here so filter_text() can be imported and tested without
    # the third-party progress bar package installed.
    import progressbar

    with open('all.json') as f:
        tweets = json.load(f)
    with progressbar.ProgressBar(max_value=len(tweets)) as bar:
        for i, tweet in enumerate(tweets):
            tweet['text'] = filter_text(tweet['text'])
            bar.update(i)
    with open('all_text_filtered.json', 'w') as f:
        json.dump(tweets, f)