Commit
taspinar committed Jun 22, 2019
Merge commit 2b0989f (2 parents: 7ec9250 + 1fe473b)
Showing 3 changed files with 68 additions and 35 deletions.
12 changes: 7 additions & 5 deletions twitterscraper/main.py
@@ -113,11 +113,13 @@ def main():
     with open(args.output, "w", encoding="utf-8") as output:
         if args.csv:
             f = csv.writer(output, delimiter=";")
-            f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
-            for x in tweets:
-                f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
-                            x.likes, x.replies, x.retweets,
-                            x.text, x.html])
+            f.writerow(["username", "fullname", "user_id", "tweet_id", "tweet_url", "timestamp", "timestamp_epochs",
+                        "replies", "retweets", "likes", "is_retweet", "retweeter_username", "retweeter_userid",
+                        "retweet_id", "text", "html"])
+            for t in tweets:
+                f.writerow([t.username, t.fullname, t.user_id, t.tweet_id, t.tweet_url, t.timestamp, t.timestamp_epochs,
+                            t.replies, t.retweets, t.likes, t.is_retweet, t.retweeter_username, t.retweeter_userid,
+                            t.retweet_id, t.text, t.html])
         else:
             json.dump(tweets, output, cls=JSONEncoder)
     if args.profiles and tweets:
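
The export above writes one semicolon-delimited row per tweet with the 16 columns named in the new header. As a quick sanity check, the file can be read back with the standard library; this is an illustrative snippet, not part of the commit, and the file name tweets.csv is only an example.

import csv

# Hypothetical read-back of the CSV produced by main.py above.
# The column names and the ";" delimiter come from the new writerow() call.
with open("tweets.csv", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter=";")
    for row in reader:
        # csv returns everything as strings; cast counters where needed.
        print(row["username"], row["tweet_url"], int(row["retweets"] or 0))
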
21 changes: 15 additions & 6 deletions twitterscraper/query.py
@@ -59,6 +59,7 @@ def query_single_page(query, lang, pos, retry=50, from_user=False):
     :return: The list of tweets, the pos argument for getting the next page.
     """
     url = get_query_url(query, lang, pos, from_user)
+    logger.info('Scraping tweets from {}', url)

     try:
         response = requests.get(url, headers=HEADER)
@@ -76,20 +77,28 @@ def query_single_page(query, lang, pos, retry=50, from_user=False):
         tweets = list(Tweet.from_html(html))

         if not tweets:
-            if json_resp:
-                pos = json_resp['min_position']
-            else:
-                pos = None
+            try:
+                if json_resp:
+                    pos = json_resp['min_position']
+                    has_more_items = json_resp['has_more_items']
+                    if not has_more_items:
+                        logger.info("Twitter returned : 'has_more_items' ")
+                        return [], None
+                else:
+                    pos = None
+            except:
+                pass
             if retry > 0:
                 logger.info('Retrying... (Attempts left: {})'.format(retry))
                 return query_single_page(query, lang, pos, retry - 1, from_user)
             else:
                 return [], pos

         if json_resp:
             return tweets, urllib.parse.quote(json_resp['min_position'])
         if from_user:
-            return tweets, tweets[-1].id
-        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
+            return tweets, tweets[-1].tweet_id
+        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id, tweets[0].tweet_id)

     except requests.exceptions.HTTPError as e:
         logger.exception('HTTPError {} while requesting "{}"'.format(
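
With this change query_single_page returns ([], None) once Twitter reports 'has_more_items': false, so a caller can keep requesting pages until the position runs out. The loop below sketches that usage; scrape_all is a hypothetical helper written for illustration, not a function in twitterscraper.

# Hypothetical driver loop around query_single_page as defined above.
def scrape_all(query, lang='', limit=500):
    collected, pos = [], None
    while len(collected) < limit:
        batch, pos = query_single_page(query, lang, pos)
        collected.extend(batch)
        # Stop when Twitter says there is nothing more (pos is None)
        # or when retries were exhausted and the batch came back empty.
        if pos is None or not batch:
            break
    return collected[:limit]
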
70 changes: 46 additions & 24 deletions twitterscraper/tweet.py
@@ -6,39 +6,61 @@

 @generate_ordering('timestamp', 'id', 'text', 'user', 'replies', 'retweets', 'likes')
 class Tweet:
-    def __init__(self, user, fullname, id, url, timestamp, text, replies, retweets, likes, html):
-        self.user = user.strip('\@')
+    def __init__(self, username, fullname, user_id, tweet_id, tweet_url, timestamp, timestamp_epochs, replies, retweets,
+                 likes, is_retweet, retweeter_username, retweeter_userid, retweet_id, text, html):
+        self.username = username.strip('\@')
         self.fullname = fullname
-        self.id = id
-        self.url = url
+        self.user_id = user_id
+        self.tweet_id = tweet_id
+        self.tweet_url = tweet_url
         self.timestamp = timestamp
-        self.text = text
+        self.timestamp_epochs = timestamp_epochs
         self.replies = replies
         self.retweets = retweets
         self.likes = likes
+        self.is_retweet = is_retweet
+        self.retweeter_username = retweeter_username
+        self.retweeter_userid = retweeter_userid
+        self.retweet_id = retweet_id
+        self.text = text
         self.html = html

     @classmethod
     def from_soup(cls, tweet):
-        return cls(
-            user=tweet.find('span', 'username').text or "",
-            fullname=tweet.find('strong', 'fullname').text or "",
-            id=tweet['data-item-id'] or "",
-            url = tweet.find('div', 'tweet')['data-permalink-path'] or "",
-            timestamp=datetime.utcfromtimestamp(
-                int(tweet.find('span', '_timestamp')['data-time'])),
-            text=tweet.find('p', 'tweet-text').text or "",
-            replies = int(tweet.find(
-                'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
-                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            retweets = int(tweet.find(
-                'span', 'ProfileTweet-action--retweet u-hiddenVisually').find(
-                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            likes = int(tweet.find(
-                'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
-                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0'),
-            html=str(tweet.find('p', 'tweet-text')) or "",
-        )
+        tweet_div = tweet.find('div', 'tweet')
+        username = tweet_div["data-screen-name"]
+        fullname = tweet_div["data-name"]
+        user_id = tweet_div["data-user-id"]
+        tweet_id = tweet_div["data-tweet-id"]
+        tweet_url = tweet_div["data-permalink-path"]
+        timestamp_epochs = int(tweet.find('span', '_timestamp')['data-time'])
+        timestamp = datetime.utcfromtimestamp(timestamp_epochs)
+        try:
+            retweet_id = tweet_div["data-retweet-id"]
+            retweeter_username = tweet_div["data-retweeter"]
+            retweeter_userid = tweet_div.find('a', "pretty-link js-user-profile-link")["data-user-id"]
+            is_retweet = 1
+        except:
+            retweet_id = ""
+            retweeter_username = ""
+            retweeter_userid = ""
+            is_retweet = 0
+
+        text = tweet.find('p', 'tweet-text').text or ""
+        replies = int(tweet.find(
+            'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
+            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        retweets = int(tweet.find(
+            'span', 'ProfileTweet-action--retweet u-hiddenVisually').find(
+            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        likes = int(tweet.find(
+            'span', 'ProfileTweet-action--favorite u-hiddenVisually').find(
+            'span', 'ProfileTweet-actionCount')['data-tweet-stat-count'] or '0')
+        html = str(tweet.find('p', 'tweet-text')) or ""
+
+        c = cls(username, fullname, user_id, tweet_id, tweet_url, timestamp, timestamp_epochs, replies, retweets, likes,
+                is_retweet, retweeter_username, retweeter_userid, retweet_id, text, html)
+        return c

     @classmethod
     def from_html(cls, html):
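
The new from_soup reads the retweet metadata from the data-retweet-id and data-retweeter attributes of the tweet <div> and falls back to empty values when they are missing. The helper below is a rough, illustrative equivalent that uses BeautifulSoup's .get() instead of the bare except; it is not part of the commit.

# Hypothetical helper mirroring the retweet-detection block in Tweet.from_soup.
# tweet_div is the same bs4 Tag returned by tweet.find('div', 'tweet') above.
def retweet_info(tweet_div):
    retweet_id = tweet_div.get("data-retweet-id", "")
    retweeter_username = tweet_div.get("data-retweeter", "")
    profile_link = tweet_div.find('a', 'pretty-link js-user-profile-link')
    retweeter_userid = profile_link.get("data-user-id", "") if profile_link else ""
    is_retweet = 1 if retweet_id else 0
    return is_retweet, retweet_id, retweeter_username, retweeter_userid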
