gitscraper.py
# Primary module that scrapes all the features from each repository.
# Calls gitfeatures.py, which contains many of the individual feature scrapers.
import json
import signal

import numpy as np

import gitfeatures as gf


class TimeoutException(Exception):  # Custom exception class
    pass


def timeout_handler(signum, frame):  # Custom signal handler
    raise TimeoutException
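
# Note: SIGALRM-based timeouts (signal.alarm) are only available on Unix-like
# systems and the handler must be registered in the main thread; running this
# scraper on Windows would require a different timeout mechanism.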


class Github_Profile:
    """
    Class containing the stats for each GitHub profile.
    """
    def __init__(self):
        self.user = ''
        self.url = ''

        # metrics
        self.commit_history = []
        self.commits_per_time = 0
        self.n_commits = 0
        self.n_stars = 0
        self.n_forks = 0
        self.test_lines = 0
        self.docstring_lines = 0
        self.comment_lines = 0
        self.readme_lines = 0
        self.code_lines = 0
        self.n_pyfiles = 0

        self.pep8 = {}
        pep8_categories = ['E1', 'E2', 'E3', 'E4', 'E5', 'E7',
                           'E9', 'W1', 'W2', 'W3', 'W5', 'W6']
        for p in pep8_categories:
            self.pep8[p] = 0
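
# Github_Profile.pep8 counts errors per pycodestyle error class (e.g. E1 =
# indentation, E2 = whitespace, E5 = line length, W6 = deprecation);
# gitfeatures.get_pep8_errs is assumed to increment these counters per file.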


def get_metrics_per_file(item, GProfile):
    """
    Extract metrics from each Python file:
    - comment/code ratio
    - pep8 errors
    - number of code lines and test lines
    """
    r = gf.get_request(item['download_url'])
    if r.ok:
        text = r.text

        # metrics
        GProfile.comment_lines += gf.get_comments(text, '#', '\n')
        GProfile.docstring_lines += gf.get_comments(text, '"""', '"""')
        gf.get_pep8_errs(text, GProfile)

        code_len = len(text.split('\n'))
        GProfile.code_lines += code_len

        # tests
        if item['name'].lower()[:5] == 'test_' and 'assert' in text:  # pytest
            GProfile.test_lines += code_len


def digest_repo(repo_url, GProfile):
    """
    Look through each file and directory and extract metrics from
    each Python file. Recursive function.
    """
    r = gf.get_request(repo_url)
    if r.ok:
        repoItems = json.loads(r.text or r.content)
        signal.signal(signal.SIGALRM, timeout_handler)
        for item in repoItems:
            signal.alarm(10)  # skip the file if it takes more than 10 seconds
            try:
                if item['type'] == 'file' and item['name'][-3:] == '.py':
                    GProfile.n_pyfiles += 1
                    print(item['download_url'])
                    get_metrics_per_file(item, GProfile)
                elif item['type'] == 'dir':
                    digest_repo(item['url'], GProfile)
            except TimeoutException:
                print('%s timed out, skipping!' % item['download_url'])
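
# digest_repo expects a GitHub "contents" API listing: a JSON array of items
# with 'type', 'name', 'url' and 'download_url' fields, and it recurses into
# subdirectories via each directory item's 'url'.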


def get_features(item, GP):
    """
    Top-level function that scrapes the features of a repository
    and stores them in a Github_Profile instance.
    """
    contents_url = '%s/contents' % item['url']

    # scrape readme
    gf.get_readme_length(contents_url, GP)

    # scrape file-by-file stats
    digest_repo(contents_url, GP)

    # scrape commit history
    gf.get_repo_commit_history(item, GP)

    # scrape stargazers
    GP.n_stars = item['stargazers_count']

    # scrape forks
    GP.n_forks = item['forks_count']
    return GP


def get_batch_repos(repo_list_dir, output_dir):
    """
    Top-level function that batch-extracts the statistics
    from a collection of repositories.
    """
    proc_repos = np.loadtxt(output_dir, delimiter=',', usecols=[0], dtype='str')
    repos = open(repo_list_dir, 'r').read().splitlines()

    # Change the behavior of SIGALRM
    signal.signal(signal.SIGALRM, timeout_handler)
    for repo in repos:
        if repo in proc_repos:
            print('already scanned %s' % repo)
            continue

        GP = Github_Profile()
        GP.user = repo.split('repos/')[1].split('/')[0]
        r = gf.get_request(repo)
        if r.ok:
            item = json.loads(r.text or r.content)
            signal.alarm(60)
            try:
                if item['fork'] is False:  # for now, ignore forks
                    GP = get_features(item, GP)

                    # write each repo's Github_Profile to file
                    string = '%s, %d, %d, %d, %d, %d, %d, %d, %f, %d, %d'
                    data = open(output_dir, 'a')
                    data.write(string % (repo, GP.n_pyfiles, GP.code_lines,
                                         GP.comment_lines, GP.docstring_lines,
                                         GP.test_lines, GP.readme_lines,
                                         GP.n_commits, GP.commits_per_time,
                                         GP.n_stars, GP.n_forks))
                    for key in GP.pep8.keys():
                        data.write(', %d' % GP.pep8[key])
                    data.write('\n')
                    data.close()
            except TimeoutException:
                print('%s timed out, skipping!' % repo)
            except Exception:
                print('skipping repo %s' % repo)
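
# Rows appended by get_batch_repos are: repo URL, n_pyfiles, code_lines,
# comment_lines, docstring_lines, test_lines, readme_lines, n_commits,
# commits_per_time, n_stars, n_forks, followed by one count per pep8 class.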


def scrape_single_repo(user_repo):
    """
    Top-level function that scrapes the statistics for a single
    GitHub repository.
    """
    GP = Github_Profile()
    repo = 'https://api.github.com/repos/%s' % user_repo
    GP.user = repo.split('repos/')[1].split('/')[0]
    r = gf.get_request(repo)
    if r.ok:
        item = json.loads(r.text or r.content)
        signal.signal(signal.SIGALRM, timeout_handler)  # needed for alarm() below
        signal.alarm(60)
        try:
            if item['fork'] is False:  # for now, ignore forks
                GP = get_features(item, GP)
        except Exception:
            print('could not scrape %s' % repo)
    return GP


if __name__ == '__main__':
    repo_dir = 'repo_data/top_stars_repos_Python.txt'
    output_dir = 'repo_data/top_stars_stats_Python.txt'
    get_batch_repos(repo_dir, output_dir)
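
# Minimal usage sketch for scraping a single repository rather than a batch run
# (hypothetical repository name; gitfeatures.get_request is assumed to return a
# requests-style Response object):
#
#     profile = scrape_single_repo('numpy/numpy')
#     print(profile.n_pyfiles, profile.code_lines, profile.test_lines)
#     print(profile.pep8)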