Commit
taspinar committed Jun 22, 2019
Merge commit 2b0989f (2 parents: 7ec9250 + 1fe473b)
Showing 3 changed files with 68 additions and 35 deletions.
12 changes: 7 additions & 5 deletions twitterscraper/main.py
@@ -113,11 +113,13 @@ def main():
     with open(args.output, "w", encoding="utf-8") as output:
         if args.csv:
             f = csv.writer(output, delimiter=";")
-            f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
-            for x in tweets:
-                f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
-                            x.likes, x.replies, x.retweets,
-                            x.text, x.html])
+            f.writerow(["username", "fullname", "user_id", "tweet_id", "tweet_url", "timestamp", "timestamp_epochs",
+                        "replies", "retweets", "likes", "is_retweet", "retweeter_username", "retweeter_userid",
+                        "retweet_id", "text", "html"])
+            for t in tweets:
+                f.writerow([t.username, t.fullname, t.user_id, t.tweet_id, t.tweet_url, t.timestamp, t.timestamp_epochs,
+                            t.replies, t.retweets, t.likes, t.is_retweet, t.retweeter_username, t.retweeter_userid,
+                            t.retweet_id, t.text, t.html])
         else:
             json.dump(tweets, output, cls=JSONEncoder)
     if args.profiles and tweets:
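
The export above writes one semicolon-delimited row per tweet with the 16 columns named in the new header. As a quick sanity check, the file can be read back with the standard library; this is an illustrative snippet, not part of the commit, and the file name tweets.csv is only an example.

import csv

# Hypothetical read-back of the CSV produced by main.py above.
# The column names and the ";" delimiter come from the new writerow() call.
with open("tweets.csv", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter=";")
    for row in reader:
        # csv returns everything as strings; cast counters where needed.
        print(row["username"], row["tweet_url"], int(row["retweets"] or 0))
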
21 changes: 15 additions & 6 deletions twitterscraper/query.py
@@ -59,6 +59,7 @@ def query_single_page(query, lang, pos, retry=50, from_user=False):
     :return: The list of tweets, the pos argument for getting the next page.
     """
     url = get_query_url(query, lang, pos, from_user)
+    logger.info('Scraping tweets from {}', url)

     try:
         response = requests.get(url, headers=HEADER)
@@ -76,20 +77,28 @@ def query_single_page(query, lang, pos, retry=50, from_user=False):
         tweets = list(Tweet.from_html(html))

         if not tweets:
-            if json_resp:
-                pos = json_resp['min_position']
-            else:
-                pos = None
+            try:
+                if json_resp:
+                    pos = json_resp['min_position']
+                    has_more_items = json_resp['has_more_items']
+                    if not has_more_items:
+                        logger.info("Twitter returned : 'has_more_items' ")
+                        return [], None
+                else:
+                    pos = None
+            except:
+                pass
             if retry > 0:
                 logger.info('Retrying... (Attempts left: {})'.format(retry))
                 return query_single_page(query, lang, pos, retry - 1, from_user)
             else:
                 return [], pos

         if json_resp:
             return tweets, urllib.parse.quote(json_resp['min_position'])
         if from_user:
-            return tweets, tweets[-1].id
-        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
+            return tweets, tweets[-1].tweet_id
+        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id, tweets[0].tweet_id)

     except requests.exceptions.HTTPError as e:
         logger.exception('HTTPError {} while requesting "{}"'.format(
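
With this change query_single_page returns ([], None) once Twitter reports 'has_more_items': false, so a caller can keep requesting pages until the position runs out. The loop below sketches that usage; scrape_all is a hypothetical helper written for illustration, not a function in twitterscraper.

# Hypothetical driver loop around query_single_page as defined above.
def scrape_all(query, lang='', limit=500):
    collected, pos = [], None
    while len(collected) < limit:
        batch, pos = query_single_page(query, lang, pos)
        collected.extend(batch)
        # Stop when Twitter says there is nothing more (pos is None)
        # or when retries were exhausted and the batch came back empty.
        if pos is None or not batch:
            break
    return collected[:limit]
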
70 changes: 46 additions & 24 deletions twitterscraper/tweet.py
@@ -6,39 +6,61 @@

 @generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'retweets', 'likes')
 class Tweet:
-    def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, likes, html):
-        self.user = user.strip('\@')
+    def __init__(self, username, fullname, user_id, tweet_id, tweet_url, timestamp, timestamp_epochs, replies, retweets,
+                 likes, is_retweet, retweeter_username, retweeter_userid, retweet_id, text, html):
+        self.username = username.strip('\@')
         self.fullname = fullname
-        self.id = id
-        self.url = url
+        self.user_id = user_id
+        self.tweet_id = tweet_id
+        self.tweet_url = tweet_url
         self.timestamp = timestamp
-        self.text = text
+        self.timestamp_epochs = timestamp_epochs
         self.replies = replies
         self.retweets = retweets
         self.likes = likes
+        self.is_retweet = is_retweet
+        self.retweeter_username = retweeter_username
+        self.retweeter_userid = retweeter_userid
+        self.retweet_id = retweet_id
+        self.text = text
         self.html = html

     @classmethod
     def from_soup(cls, tweet):
-        return cls(
-            user=tweet.find('span', 'username').text or "",
-            fullname=tweet.find('strong', 'fullname').text or "",
-            id=tweet['data-item-id'] or "",
-            url = tweet.find('div', 'tweet')['data-permalink-path'] or "",
-            timestamp=datetime.utcfromtimestamp(
-                int(tweet.find('span', '_timestamp')['data-time'])),
-            text=tweet.find('p', 'tweet-text').text or "",
-            replies = int(tweet.find(
-                'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
-                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            retweets = int(tweet.find(
-                'span', 'ProfileTweet-action--retweet u-hiddenVisually').find(
-                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            likes = int(tweet.find(
-                'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
-                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            html=str(tweet.find('p', 'tweet-text')) or "",
-        )
+        tweet_div = tweet.find('div', 'tweet')
+        username = tweet_div["data-screen-name"]
+        fullname = tweet_div["data-name"]
+        user_id = tweet_div["data-user-id"]
+        tweet_id = tweet_div["data-tweet-id"]
+        tweet_url = tweet_div["data-permalink-path"]
+        timestamp_epochs = int(tweet.find('span', '_timestamp')['data-time'])
+        timestamp = datetime.utcfromtimestamp(timestamp_epochs)
+        try:
+            retweet_id = tweet_div["data-retweet-id"]
+            retweeter_username = tweet_div["data-retweeter"]
+            retweeter_userid = tweet_div.find('a', "pretty-link js-user-profile-link")["data-user-id"]
+            is_retweet = 1
+        except:
+            retweet_id = ""
+            retweeter_username = ""
+            retweeter_userid = ""
+            is_retweet = 0
+
+        text = tweet.find('p', 'tweet-text').text or ""
+        replies = int(tweet.find(
+            'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
+            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        retweets = int(tweet.find(
+            'span', 'ProfileTweet-action--retweet u-hiddenVisually').find(
+            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        likes = int(tweet.find(
+            'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
+            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        html = str(tweet.find('p', 'tweet-text')) or ""
+
+        c = cls(username, fullname, user_id, tweet_id, tweet_url, timestamp, timestamp_epochs, replies, retweets, likes,
+                is_retweet, retweeter_username, retweeter_userid, retweet_id, text, html)
+        return c

     @classmethod
     def from_html(cls, html):
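
The new from_soup reads the retweet metadata from the data-retweet-id and data-retweeter attributes of the tweet <div> and falls back to empty values when they are missing. The helper below is a rough, illustrative equivalent that uses BeautifulSoup's .get() instead of the bare except; it is not part of the commit.

# Hypothetical helper mirroring the retweet-detection block in Tweet.from_soup.
# tweet_div is the same bs4 Tag returned by tweet.find('div', 'tweet') above.
def retweet_info(tweet_div):
    retweet_id = tweet_div.get("data-retweet-id", "")
    retweeter_username = tweet_div.get("data-retweeter", "")
    profile_link = tweet_div.find('a', 'pretty-link js-user-profile-link')
    retweeter_userid = profile_link.get("data-user-id", "") if profile_link else ""
    is_retweet = 1 if retweet_id else 0
    return is_retweet, retweet_id, retweeter_username, retweeter_userid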
