add tweets to each paper, I think i like it

karpathy · Apr 4, 2020 · fdeb562 · fdeb562
1 parent 1ef7833
commit fdeb562
Show file tree

Hide file tree

Showing 5 changed files with 161 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -47,6 +47,10 @@ And in my `crontab -l` I make sure this runs every 1 hour, for example:
 3 * * * * /root/covid-sanity/pull.sh > /root/cron.log 2>&1
 ```
 
+## seeing tweets
+
+Seeing the tweets for each paper is purely optional. To enable it, follow the instructions for setting up the [python-twitter API](https://python-twitter.readthedocs.io/en/latest/) and then write your secrets into a file `twitter.txt`, one per line, in the order: consumer key, consumer secret, access token key, access token secret. These get loaded in `twitter_daemon.py`. I run this daemon process in a screen session, where it loops over all papers continuously, pulls the tweets for each one, and saves the results to `tweets.json` at the end of every pass.
+
 ## License
 
 MIT
diff --git a/serve.py b/serve.py
@@ -2,6 +2,7 @@
 Simple flask server for the interface
 """
 
+import os
 import json
 
 from flask import Flask, request, redirect, url_for
@@ -23,13 +24,23 @@
 with open('search.json', 'r') as f:
     search_dict = json.load(f)
 
+# OPTIONAL: load tweet dictionary, if twitter_daemon has run
+# tweets.json is written by twitter_daemon.py and maps a paper's rel_doi to
+# the list of tweet dicts found for it; without the file no paper gets tweets
+tweets_dict = {}
+if os.path.isfile('tweets.json'):
+    with open('tweets.json', 'r') as f:
+        tweets_dict = json.load(f)
+# decorate each paper with tweets
+for j in jall['rels']:
+    # papers the daemon has not seen (yet) get an empty list
+    j['tweets'] = tweets_dict.get(j['rel_doi'], [])
+    # show tweets from the accounts with the most followers first
+    j['tweets'].sort(key=lambda t: t['followers'], reverse=True)
+
 # do some precomputation since we're going to be doing lookups of doi -> doc index
 doi_to_ix = {}
 for i, j in enumerate(jall['rels']):
     doi_to_ix[j['rel_doi']] = i
 
 # -----------------------------------------------------------------------------
-# routes below
+# few helper functions for routes
 
 def default_context(papers, **kwargs):
     """ build a default context for the frontend """
@@ -38,6 +49,9 @@ def default_context(papers, **kwargs):
     context = {'papers': papers, 'gvars': gvars}
     return context
 
+# -----------------------------------------------------------------------------
+# routes below
+
 @app.route("/search", methods=['GET'])
 def search():
     q = request.args.get('q', '') # get the search request

diff --git a/static/paper_list.js b/static/paper_list.js
@@ -1,8 +1,24 @@
 'use strict';
 
+const Tweet = props => {
+  const t = props.tweet;
+  const turl = "https://twitter.com/" + t.name + "/status/" + t.id;
+  return (
+    <div class='tweet'>
+      <a href={turl}><img src={t.image_url}></img></a>
+      <div class='meta'>
+        <span class="following">{t.followers}</span>
+        <span class="uname"><a href={turl}>{t.name}</a></span>
+        <span class="text">{t.text}</span>
+      </div>
+    </div>
+  )
+}
+
 const Paper = props => {
   const p = props.paper
   const url = p.rel_link + '.full.pdf';
+  const tlst = p.tweets.map((jtweet, ix) => <Tweet key={ix} tweet={jtweet} />);
   return (
     <div class={'rel_paper ' + p.rel_site}>
       <div class='dllinks'>
@@ -13,6 +29,7 @@ const Paper = props => {
       <div class='rel_title'><a href={p.rel_link}>{p.rel_title}</a></div>
       <div class='rel_authors'>{p.rel_authors}</div>
       <div class='rel_abs'>{p.rel_abs}</div>
+      <div class='rel_tweets'>{tlst}</div>
     </div>
   )
 }

diff --git a/static/style.css b/static/style.css
@@ -92,9 +92,56 @@ h1 {
     background-color: #EFE;
     padding: 10px;
     margin-top: 10px;
+    border-radius: 5px 5px 0px 0px;
+}
+
+.rel_tweets {
+}
+
+.tweet {
+    background-color: #EEF;
+    font-family: Arial, Helvetica, sans-serif;
+    font-size: 14px;
     border-radius: 5px;
+    padding: 5px;
+    margin-top: 5px;
+    min-height: 48px;
+}
+
+.tweet img {
+    float: left;
+    margin-right: 5px;
+    border-radius: 5px;
+}
+
+.tweet .meta {
+    margin-top: 3px;
 }
 
+.tweet .following {
+    background-color: #0e4c92;
+    padding: 3px 5px 3px 5px;
+    border-radius: 3px 0px 0px 3px;
+    color: white;
+}
+
+.tweet .uname {
+    padding: 3px 5px 3px 5px;
+    background-color: #6a99d4;
+    color: white;
+    border-radius: 0px 3px 3px 0px;
+    margin-right: 5px;
+}
+
+.tweet .uname a {
+    color: white;
+}
+
+.tweet .text {
+    line-height: 18px;
+}
+
+
 #info {
     background-color: #EEF;
     padding: 10px;

diff --git a/twitter_daemon.py b/twitter_daemon.py
@@ -0,0 +1,78 @@
+"""
+Continuously iterates over existing database and pulls in tweets for each paper.
+"""
+
+import time
+import json
+import urllib
+
+import twitter # pip install python-twitter
+
+from run import write_json
+
+# -----------------------------------------------------------------------------
+
+def get_api_keys():
+    lines = open('twitter.txt', 'r').read().splitlines()
+    return lines
+
+def process_tweet(r):
+    tweet = {}
+    tweet['id'] = str(r.id)
+    tweet['name'] = r.user.screen_name
+    tweet['image_url'] = r.user.profile_image_url
+    tweet['followers'] = r.user.followers_count
+    tweet['verified'] = r.user.verified
+    tweet['text'] = r.full_text
+    return tweet
+
+def get_tweets(j):
+
+    # note: we're assuming v1, which is kinda sketchy and slightly wrong...
+    q = f"https://www.{j['rel_site']}.org/content/{j['rel_doi']}v1"
+    q = urllib.parse.quote(q, safe='')
+    exclude_replies = '%20-filter%3Areplies'
+    exclude_retweets = '%20-filter%3Aretweets'
+    suffix = exclude_replies + exclude_retweets
+    results = api.GetSearch(raw_query="q=%s%s&result_type=recent&count=100" % (q, suffix)) # rate limit: 1 per 5 seconds
+
+    # extract just what we need from tweets and not much more
+    jtweets = [process_tweet(r) for r in results]
+
+    # ban a few simple aggregator accounts
+    banned = ['medrxivpreprint', 'biorxivpreprint', 'glycopreprint']
+    jtweets = [t for t in jtweets if t['name'] not in banned]
+
+    return jtweets
+
+# -----------------------------------------------------------------------------
+
+if __name__ == '__main__':
+
+    # `api` is deliberately a module-level name: get_tweets() looks it up as a global
+    keys = get_api_keys()
+    # NOTE(review): tweet_mode='extended' is presumably what makes `full_text`
+    # (read in process_tweet) available on results; confirm in python-twitter docs
+    api = twitter.Api(consumer_key=keys[0],
+                      consumer_secret=keys[1],
+                      access_token_key=keys[2],
+                      access_token_secret=keys[3],
+                      tweet_mode='extended')
+
+    # run forever
+    while True:
+
+        # open the latest state of database
+        with open('jall.json', 'r') as f:
+            jall = json.load(f)
+
+        # get all tweets for all papers
+        # results are rebuilt from scratch on every pass, keyed by the paper's doi
+        tweets = {}
+        for i, j in enumerate(jall['rels']):
+            jtweets = get_tweets(j)
+            tweets[j['rel_doi']] = jtweets
+            print('%d/%d: found %d tweets for %s' % (i+1, len(jall['rels']), len(jtweets), j['rel_link']))
+            # rate limit is 180 calls per 15 minutes, or 1 call per 5 seconds. so sleep 10 to stay well under it
+            time.sleep(10)
+
+        # save to file when done
+        # note: tweets.json is only written after a full pass over all papers
+        write_json(tweets, 'tweets.json')
+        print('-'*80)
+