diff --git a/twitterscraper/main.py b/twitterscraper/main.py
index c847639..954c561 100644
--- a/twitterscraper/main.py
+++ b/twitterscraper/main.py
@@ -113,11 +113,13 @@ def main():
         with open(args.output, "w", encoding="utf-8") as output:
             if args.csv:
                 f = csv.writer(output, delimiter=";")
-                f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
-                for x in tweets:
-                    f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
-                                x.likes, x.replies, x.retweets,
-                                x.text, x.html])
+                f.writerow(["username", "fullname", "user_id", "tweet_id", "tweet_url", "timestamp", "timestamp_epochs",
+                            "replies", "retweets", "likes", "is_retweet", "retweeter_username", "retweeter_userid",
+                            "retweet_id", "text", "html"])
+                for t in tweets:
+                    f.writerow([t.username, t.fullname, t.user_id, t.tweet_id, t.tweet_url, t.timestamp, t.timestamp_epochs,
+                                t.replies, t.retweets, t.likes, t.is_retweet, t.retweeter_username, t.retweeter_userid,
+                                t.retweet_id, t.text, t.html])
             else:
                 json.dump(tweets, output, cls=JSONEncoder)
     if args.profiles and tweets:
diff --git a/twitterscraper/query.py b/twitterscraper/query.py
index 21a42ff..c156dcb 100644
--- a/twitterscraper/query.py
+++ b/twitterscraper/query.py
@@ -59,6 +59,7 @@ def query_single_page(query, lang, pos, retry=50, from_user=False):
     :return: The list of tweets, the pos argument for getting the next page.
     """
     url = get_query_url(query, lang, pos, from_user)
+    logger.info('Scraping tweets from {}'.format(url))
 
     try:
         response = requests.get(url, headers=HEADER)
@@ -76,11 +77,19 @@ def query_single_page(query, lang, pos, retry=50, from_user=False):
         tweets = list(Tweet.from_html(html))
 
         if not tweets:
-            if json_resp:
-                pos = json_resp['min_position']
-            else:
-                pos = None
+            try:
+                if json_resp:
+                    pos = json_resp['min_position']
+                    has_more_items = json_resp['has_more_items']
+                    if not has_more_items:
+                        logger.info("Twitter returned 'has_more_items': False")
+                        return [], None
+                else:
+                    pos = None
+            except KeyError:
+                pass
             if retry > 0:
+                logger.info('Retrying... (Attempts left: {})'.format(retry))
                 return query_single_page(query, lang, pos, retry - 1, from_user)
             else:
                 return [], pos
@@ -88,8 +97,8 @@ def query_single_page(query, lang, pos, retry=50, from_user=False):
         if json_resp:
             return tweets, urllib.parse.quote(json_resp['min_position'])
         if from_user:
-            return tweets, tweets[-1].id
-        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
+            return tweets, tweets[-1].tweet_id
+        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id, tweets[0].tweet_id)
 
     except requests.exceptions.HTTPError as e:
         logger.exception('HTTPError {} while requesting "{}"'.format(
diff --git a/twitterscraper/tweet.py b/twitterscraper/tweet.py
index f477b8b..8339619 100644
--- a/twitterscraper/tweet.py
+++ b/twitterscraper/tweet.py
@@ -6,39 +6,61 @@
 
 @generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'retweets', 'likes')
 class Tweet:
-    def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, likes, html):
-        self.user = user.strip('\@')
+    def __init__(self, username, fullname, user_id, tweet_id, tweet_url, timestamp, timestamp_epochs, replies, retweets,
+                 likes, is_retweet, retweeter_username, retweeter_userid, retweet_id, text, html):
+        self.username = username.strip('\@')
         self.fullname = fullname
-        self.id = id
-        self.url = url
+        self.user_id = user_id
+        self.tweet_id = tweet_id
+        self.tweet_url = tweet_url
         self.timestamp = timestamp
-        self.text = text
+        self.timestamp_epochs = timestamp_epochs
         self.replies = replies
         self.retweets = retweets
         self.likes = likes
+        self.is_retweet = is_retweet
+        self.retweeter_username = retweeter_username
+        self.retweeter_userid = retweeter_userid
+        self.retweet_id = retweet_id
+        self.text = text
         self.html = html
 
     @classmethod
     def from_soup(cls, tweet):
-        return cls(
-            user=tweet.find('span', 'username').text or "",
-            fullname=tweet.find('strong', 'fullname').text or "",
-            id=tweet['data-item-id'] or "",
-            url = tweet.find('div', 'tweet')['data-permalink-path'] or "",
-            timestamp=datetime.utcfromtimestamp(
-                int(tweet.find('span', '_timestamp')['data-time'])),
-            text=tweet.find('p', 'tweet-text').text or "",
-            replies = int(tweet.find(
-                'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
-                    'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            retweets = int(tweet.find(
-                'span', 'ProfileTweet-action--retweet u-hiddenVisually').find(
-                    'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            likes = int(tweet.find(
-                'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
-                    'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            html=str(tweet.find('p', 'tweet-text')) or "",
-        )
+        tweet_div = tweet.find('div', 'tweet')
+        username = tweet_div["data-screen-name"]
+        fullname = tweet_div["data-name"]
+        user_id = tweet_div["data-user-id"]
+        tweet_id = tweet_div["data-tweet-id"]
+        tweet_url = tweet_div["data-permalink-path"]
+        timestamp_epochs = int(tweet.find('span', '_timestamp')['data-time'])
+        timestamp = datetime.utcfromtimestamp(timestamp_epochs)
+        try:
+            retweet_id = tweet_div["data-retweet-id"]
+            retweeter_username = tweet_div["data-retweeter"]
+            retweeter_userid = tweet_div.find('a', "pretty-link js-user-profile-link")["data-user-id"]
+            is_retweet = 1
+        except (KeyError, TypeError):
+            retweet_id = ""
+            retweeter_username = ""
+            retweeter_userid = ""
+            is_retweet = 0
+
+        text = tweet.find('p', 'tweet-text').text or ""
+        replies = int(tweet.find(
+            'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
+                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        retweets = int(tweet.find(
+            'span', 'ProfileTweet-action--retweet u-hiddenVisually').find(
+                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        likes = int(tweet.find(
+            'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
+                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        html = str(tweet.find('p', 'tweet-text')) or ""
+
+        c = cls(username, fullname, user_id, tweet_id, tweet_url, timestamp, timestamp_epochs, replies, retweets, likes,
+                is_retweet, retweeter_username, retweeter_userid, retweet_id, text, html)
+        return c
 
     @classmethod
     def from_html(cls, html):