28
28
require 'fileutils'
29
29
require 'slop'
30
30
require 'octokit'
31
-
32
- max = 1000
31
+ require 'date'
33
32
34
33
opts = Slop . parse do |o |
35
34
o . string '--token' , 'GitHub access token' , default : ''
36
35
o . boolean '--dry' , 'Make no round-trips to GitHub API (for testing)' , default : false
37
36
o . integer '--total' , 'Total number of repos to take from GitHub' , required : true
38
37
o . integer '--pause' , 'How many seconds to sleep between API calls' , default : 10
39
38
o . integer '--page-size' , 'Number of repos to fetch in one API call' , default : 100
40
- o . integer '--min-stars' , 'Minimum GitHub stars in each repo' , default : max
39
+ o . integer '--min-stars' , 'Minimum GitHub stars in each repo' , default : 1000
41
40
o . integer '--max-stars' , 'Maximum GitHub stars in each repo' , default : 100_000
42
41
o . integer '--min-size' , 'Minimum size of GitHub repo, in Kb' , default : 100
42
+ o . integer '--start-year' , 'The starting year for querying repositories' , default : Date . today . year
43
43
o . string '--csv' , 'The file name to save the list to' , required : true
44
44
o . string '--tex' , 'The file name to save LaTeX summary of the operation' , required : true
45
45
o . on '--help' do
48
48
end
49
49
end
50
50
51
- raise 'Can only retrieve up to 1000 repos' if opts [ :total ] > max
52
-
53
51
puts "Trying to find #{ opts [ :total ] } repos in GitHub"
54
52
size = [ opts [ :page_size ] , opts [ :total ] ] . min
55
53
puts "Taking up to #{ size } repos per one GitHub API request"
65
63
puts 'Accessing GitHub with personal access token!'
66
64
end
67
65
found = { }
68
- page = 0
69
- query = [
70
- "stars:#{ opts [ 'min-stars' ] } ..#{ opts [ 'max-stars' ] } " ,
71
- "size:>=#{ opts [ 'min-size' ] } " ,
72
- 'language:java' ,
73
- 'is:public' ,
74
- 'mirror:false' ,
75
- 'archived:false' ,
76
- 'template:false' ,
77
- 'NOT' ,
78
- 'android'
79
- ] . join ( ' ' )
80
-
81
66
def mock_array ( size , licenses )
82
67
Array . new ( size ) do
83
68
{
@@ -97,44 +82,105 @@ def mock_reps(page, size, licenses)
97
82
}
98
83
end
99
84
100
- def cooldown ( opts , found )
101
- puts "Let's sleep for #{ opts [ :pause ] } seconds to cool off GitHub API \
102
- (already found #{ found . count } repos, need #{ opts [ :total ] } )..."
103
- sleep opts [ :pause ]
85
# Searches for repositories created within one calendar year and reports
# the running total afterwards.
#
# @param year [Integer] the year whose repositories are queried
# @param github [Object] client forwarded to loop_through_pages
#   (presumably an Octokit client; it is built outside this method)
# @param context [Hash] shared state; :opts and :found are read here, the
#   whole hash is forwarded to loop_through_pages
def process_year(year, github, context)
  year_query = build_query(year, context[:opts])
  puts "Querying for repositories created in #{year}..."
  loop_through_pages(year_query, github, context)
  total_so_far = context[:found].count
  puts "Completed querying for year #{year}. Found #{total_so_far} repositories so far."
end
105
91
106
- puts 'Not searching GitHub API, using mock repos' if opts [ :dry ]
107
- loop do
108
- break if page * size > max
109
- count = 0
110
- json = if opts [ :dry ]
111
- mock_reps ( page , size , licenses )
92
# Assembles the search query string for repositories created in +year+.
#
# @param year [Integer] calendar year used for the "created:" range
# @param opts [#[]] options answering to 'min-stars', 'max-stars' and 'min-size'
# @return [String] the space-separated search qualifiers
def build_query(year, opts)
  stars = "stars:#{opts['min-stars']}..#{opts['max-stars']}"
  size = "size:>=#{opts['min-size']}"
  created = "created:#{year}-01-01..#{year}-12-31"
  # Qualifiers that never depend on the options or the year.
  fixed = %w[is:public mirror:false archived:false template:false NOT android]
  [stars, size, 'language:java', created, *fixed].join(' ')
end
103
+
104
# Pages through the results of one search query, collecting repositories
# into context[:found].
#
# The loop stops when the requested total is reached, when a page comes
# back empty, or when the paging cap is exceeded.
#
# @param query [String] search query forwarded to fetch_repositories
# @param github [Object] client forwarded to fetch_repositories
# @param context [Hash] shared state with :found, :opts, :size and :licenses
def loop_through_pages(query, github, context)
  # Same per-query cap the earlier single-query loop applied
  # ("break if page * size > max" with max = 1000).
  # NOTE(review): presumably GitHub search refuses to page past 1000 results
  # per query; confirm against the REST search documentation.
  max_results = 1000
  wanted = context[:opts][:total]
  # NOTE(review): paging starts at 0 as before; if the API treats page 0
  # like page 1 the first page is fetched twice (duplicates are skipped by
  # process_repo) - confirm before changing the numbering.
  page = 0
  loop do
    break if context[:found].count >= wanted
    break if page * context[:size] > max_results
    items = fetch_repositories(query, github, page, context)[:items]
    process_repositories(items, context) unless items.empty?
    # No further request will be made, so there is nothing to cool off for.
    break if context[:found].count >= wanted
    # Pause after every request that may be followed by another one,
    # including the empty last page before the caller moves on.
    cooldown(context)
    break if items.empty?
    page += 1
  end
end
115
+
116
# Obtains one page of search results, either mocked or from the client.
#
# @param query [String] search query (ignored in dry mode)
# @param github [#search_repositories] client used when not in dry mode
# @param page [Integer] page number passed through unchanged
# @param context [Hash] shared state; reads :opts, :size and :licenses
# @return [Object] whatever mock_reps or the client returns; callers read
#   its :items entry
def fetch_repositories(query, github, page, context)
  per_page = context[:size]
  # Dry mode makes no round-trips and serves generated repositories instead.
  return mock_reps(page, per_page, context[:licenses]) if context[:opts][:dry]

  github.search_repositories(query, per_page: per_page, page: page)
end
123
+
124
# Feeds every repository of one result page into process_repo.
#
# @param repositories [Enumerable] items of a single search result page
# @param context [Hash] shared state; reads :found and :licenses
def process_repositories(repositories, context)
  collected = context[:found]
  accepted_licenses = context[:licenses]
  repositories.each { |entry| process_repo(entry, collected, accepted_licenses) }
end
129
+
130
# Records a single repository unless it was seen before or its license is
# not accepted, then prints a summary line for it.
#
# @param repo_data [#[]] one search result item
# @param found [Hash] repositories collected so far, keyed by full name
# @param licenses [#include?] accepted license keys
def process_repo(repo_data, found, licenses)
  # The duplicate check comes first so that the license message is not
  # printed again for an already recorded repository.
  skip = repo_already_processed?(repo_data, found) || license_invalid?(repo_data, licenses)
  return if skip

  add_repo_to_found(repo_data, found)
  print_repo_info(repo_data, found)
end
136
+
137
# Tells whether the repository is already present in the collected set.
#
# @param repo_data [#[]] one search result item; its :full_name is the key
# @param found [Hash] repositories collected so far, keyed by full name
# @return [Boolean]
def repo_already_processed?(repo_data, found)
  name = repo_data[:full_name]
  found.include?(name)
end
140
+
141
# Checks whether the repository lacks a license from the accepted list and
# prints a notice when it does.
#
# @param repo_data [#[]] one search result item; reads :license and :full_name
# @param licenses [#include?] accepted license keys
# @return [Boolean] true when the repository must be skipped
def license_invalid?(repo_data, licenses)
  license = repo_data[:license]
  accepted = !license.nil? && licenses.include?(license[:key])
  puts "Repo #{repo_data[:full_name]} doesn't contain required license. Skipping" unless accepted
  !accepted
end
146
+
147
# Stores a flattened record of the repository under its full name.
#
# @param repo_data [#[]] one search result item; :created_at must answer
#   to #iso8601
# @param found [Hash] repositories collected so far, keyed by full name
# @return [Hash] the record that was stored
def add_repo_to_found(repo_data, found)
  name = repo_data[:full_name]
  record = {
    full_name: name,
    default_branch: repo_data[:default_branch],
    created_at: repo_data[:created_at].iso8601,
    open_issues_count: repo_data[:open_issues_count],
    # The description is wrapped in double quotes.
    description: "\"#{repo_data[:description]}\"",
    # Array() turns a missing topics list into an empty one.
    topics: Array(repo_data[:topics]).join(' '),
    stars: repo_data[:stargazers_count],
    forks: repo_data[:forks_count],
    size: repo_data[:size]
  }
  found.store(name, record)
end
158
+
159
# Prints a one-line summary of a repository that has just been recorded.
#
# @param repo [#[]] one search result item; reads :full_name, :forks_count,
#   :stargazers_count and the :key of :license
# @param found [#count] repositories collected so far; its count is printed
#   as the running number
def print_repo_info(repo, found)
  title = "Found #{repo[:full_name].inspect} GitHub repo ##{found.count}"
  stats = "(#{repo[:forks_count]} forks, #{repo[:stargazers_count]} stars)"
  puts "#{title} #{stats} with license: #{repo[:license][:key]}"
end
163
+
164
# Announces and performs the configured pause.
#
# @param context [Hash] shared state; reads :opts (:pause, :total) and :found
# @return [Integer] the value returned by Kernel#sleep
def cooldown(context)
  options = context[:opts]
  pause = options[:pause]
  progress = "(already found #{context[:found].count} repos, need #{options[:total]})..."
  puts "Let's sleep for #{pause} seconds to cool off GitHub API #{progress}"
  sleep pause
end
169
+
170
# Walk the years from the requested start year back to 2008, one search
# query per year, until enough repositories are collected.
current_year = opts[:start_year]
years = (2008..current_year).to_a.reverse
# Remembers the most recent query sent to the search, so that the LaTeX
# summary written later does not print an empty query.
final_query = ''
# The shared state is the same for every year, so build it once.
context = {
  found: found,
  opts: opts,
  licenses: licenses,
  size: size
}

puts 'Not searching GitHub API, using mock repos' if opts[:dry]
years.each do |year|
  break if found.count >= opts[:total]
  final_query = build_query(year, opts)
  process_year(year, github, context)
end
139
185
puts "Found #{ found . count } total repositories in GitHub"
140
186
@@ -158,7 +204,7 @@ def cooldown(opts, found)
158
204
' GitHub API\footnote{\url{https://docs.github.com/en/rest}}' ,
159
205
' was the following:' ,
160
206
'\begin{ffcode}' ,
161
- query . gsub ( ' ' , "\n " ) ,
207
+ final_query . gsub ( ' ' , "\n " ) ,
162
208
'\end{ffcode}'
163
209
] . join ( "\n " )
164
210
)
0 commit comments