-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinstagram.py
98 lines (83 loc) · 3.92 KB
/
instagram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import datetime
import getpass
import os
from datetime import datetime
from glob import glob
from itertools import dropwhile, takewhile

import instaloader
import orjson
import pandas as pd
def get_instagram_posts(username, startdate, enddate, pwd, yourusername):
    """Download post metadata of one profile for a date window.

    Logs in as ``yourusername`` with ``pwd``, looks up the profile named
    ``username`` and walks its posts in the order instaloader yields them
    (presumably newest first -- verify against the instaloader docs).
    Posts dated after ``enddate`` are skipped until the first one that is
    not; from there on every post is printed and downloaded until a post is
    reached whose date is not after ``startdate``.

    Pictures, videos and comments are not downloaded and the JSON metadata
    is written uncompressed. The download target is named after the profile.
    """
    loader = instaloader.Instaloader(
        download_pictures=False,
        download_videos=False,
        download_comments=False,
        compress_json=False,
    )
    loader.login(yourusername, pwd)

    profile = instaloader.Profile.from_username(loader.context, username)
    all_posts = profile.get_posts()
    print(f"scraping {username}...")

    def is_after_window(candidate):
        # Still newer than the requested end of the window.
        return candidate.date > enddate

    def is_inside_window(candidate):
        # Not yet older than (or equal to) the requested start of the window.
        return candidate.date > startdate

    from_window_end = dropwhile(is_after_window, all_posts)
    for post in takewhile(is_inside_window, from_window_end):
        print(post.date)
        loader.download_post(post, target=profile.username)
def parse_instafiles(username, path):
    """Collect the instaloader JSON files of one profile into a DataFrame.

    Every ``*UTC.json`` file inside ``<path>/<username>`` is read and one row
    per post is produced with the columns ``filename``, ``time``, ``text``,
    ``likes``, ``comments``, ``username``, ``followers`` and ``post_id``,
    followed by a constant ``source`` column set to ``'Instagram'``.

    Args:
        username: Name of the sub-folder of ``path`` that holds the JSON files.
        path: Directory that contains the ``username`` folder.

    Returns:
        pandas.DataFrame with one row per successfully parsed file. The frame
        has the columns listed above even when no file was found.

    Raises:
        OSError: If ``<path>/<username>`` cannot be entered.
        KeyError: If a parsed file lacks one of the required metadata fields
            (timestamp, like/comment counts, owner data).

    Note:
        The process working directory is changed to ``<path>/<username>`` and
        is not restored; relative paths used by the caller afterwards resolve
        inside that folder.
    """
    os.chdir(os.path.join(path, username))

    column_order = ['filename', 'time', 'text', 'likes', 'comments',
                    'username', 'followers', 'post_id']
    rows = []

    # Sorted so the row order does not depend on the directory listing order.
    for file in sorted(glob('*UTC.json')):
        try:
            # orjson parses bytes directly, so no locale-dependent decoding.
            with open(file, 'rb') as filecontent:
                metadata = orjson.loads(filecontent.read())
        except (OSError, ValueError):
            # Unreadable file or malformed JSON (orjson's decode error is a
            # ValueError subclass): skip it and carry on with the next file.
            continue

        node = metadata['node']
        owner = node['owner']

        # A post without caption has an empty 'edges' list.
        try:
            text = node['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError, TypeError):
            text = ""

        rows.append({
            'filename': file,
            'time': datetime.fromtimestamp(int(node['taken_at_timestamp'])),
            'text': text,
            'likes': node['edge_media_preview_like']['count'],
            'comments': node['edge_media_to_comment']['count'],
            # Taken from the file itself, not from the ``username`` argument.
            'username': owner['username'],
            'followers': owner['edge_followed_by']['count'],
            'post_id': node.get('id', ""),
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    dataframe = pd.DataFrame(rows, columns=column_order)
    dataframe['source'] = 'Instagram'
    return dataframe
# Interactive driver: ask for credentials, target profile and date range,
# download the matching posts, then export the parsed metadata to CSV.

# Credentials of the account used to log in.
yourusername = input("enter your username here")
# Read the password without echoing it to the terminal.
yourpwd = getpass.getpass("enter your pwd here")
# Instagram username you want to scrape
username = input("enter a username! ex: realdonaldtrump")
# Date range of scraping, both entered as YYYY-MM-DD
startdate = datetime.fromisoformat(input("enter a start date in the YYYY-MM-DD format"))
enddate = datetime.fromisoformat(input("enter a end date in the YYYY-MM-DD format"))
# Remember the working directory the script was started from.
current_wkdir = os.getcwd()
# Download the posts; the download target is named after the profile.
get_instagram_posts(username, startdate, enddate, yourpwd, yourusername)
# Parse the downloaded JSON files and export all posts to CSV.
print("parsing and exporting to csv...")
df_instagram = parse_instafiles(username, current_wkdir)
# parse_instafiles changes the working directory to the profile folder, so
# this relative path is resolved there.
df_instagram.to_csv(f"{username}_instagram.csv")