Skip to content

Commit 9331a8a

Browse files
authored
Merge pull request #378 from lueFlake/more-than-1000-repos
#186 Workaround to find more than 1000 repositories
2 parents a194620 + 4b4d9cb commit 9331a8a

File tree

2 files changed

+98
-52
lines changed

2 files changed

+98
-52
lines changed

steps/discover-repos.rb

+97-51
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,18 @@
2828
require 'fileutils'
2929
require 'slop'
3030
require 'octokit'
31-
32-
max = 1000
31+
require 'date'
3332

3433
opts = Slop.parse do |o|
3534
o.string '--token', 'GitHub access token', default: ''
3635
o.boolean '--dry', 'Make no round-trips to GitHub API (for testing)', default: false
3736
o.integer '--total', 'Total number of repos to take from GitHub', required: true
3837
o.integer '--pause', 'How many seconds to sleep between API calls', default: 10
3938
o.integer '--page-size', 'Number of repos to fetch in one API call', default: 100
40-
o.integer '--min-stars', 'Minimum GitHub stars in each repo', default: max
39+
o.integer '--min-stars', 'Minimum GitHub stars in each repo', default: 1000
4140
o.integer '--max-stars', 'Maximum GitHub stars in each repo', default: 100_000
4241
o.integer '--min-size', 'Minimum size of GitHub repo, in Kb', default: 100
42+
o.integer '--start-year', 'The starting year for querying repositories', default: Date.today.year
4343
o.string '--csv', 'The file name to save the list to', required: true
4444
o.string '--tex', 'The file name to save LaTeX summary of the operation', required: true
4545
o.on '--help' do
@@ -48,8 +48,6 @@
4848
end
4949
end
5050

51-
raise 'Can only retrieve up to 1000 repos' if opts[:total] > max
52-
5351
puts "Trying to find #{opts[:total]} repos in GitHub"
5452
size = [opts[:page_size], opts[:total]].min
5553
puts "Taking up to #{size} repos per one GitHub API request"
@@ -65,19 +63,6 @@
6563
puts 'Accessing GitHub with personal access token!'
6664
end
6765
found = {}
68-
page = 0
69-
query = [
70-
"stars:#{opts['min-stars']}..#{opts['max-stars']}",
71-
"size:>=#{opts['min-size']}",
72-
'language:java',
73-
'is:public',
74-
'mirror:false',
75-
'archived:false',
76-
'template:false',
77-
'NOT',
78-
'android'
79-
].join(' ')
80-
8166
def mock_array(size, licenses)
8267
Array.new(size) do
8368
{
@@ -97,44 +82,105 @@ def mock_reps(page, size, licenses)
9782
}
9883
end
9984

100-
def cooldown(opts, found)
101-
puts "Let's sleep for #{opts[:pause]} seconds to cool off GitHub API \
102-
(already found #{found.count} repos, need #{opts[:total]})..."
103-
sleep opts[:pause]
85+
# Runs the search for repositories created in one calendar year.
#
# Builds the year-specific query, pages through its results (suitable
# repositories accumulate in context[:found]) and reports progress on
# stdout before and after.
def process_year(year, github, context)
  search = build_query(year, context[:opts])
  puts "Querying for repositories created in #{year}..."
  loop_through_pages(search, github, context)
  so_far = context[:found].count
  puts "Completed querying for year #{year}. Found #{so_far} repositories so far."
end
10591

106-
puts 'Not searching GitHub API, using mock repos' if opts[:dry]
107-
loop do
108-
break if page * size > max
109-
count = 0
110-
json = if opts[:dry]
111-
mock_reps(page, size, licenses)
92+
# Composes the GitHub search query for Java repositories created within
# the given year.
#
# The star range and the minimum size are read from opts under the
# 'min-stars', 'max-stars' and 'min-size' keys; the remaining qualifiers
# are fixed. Returns the qualifiers joined by single spaces.
def build_query(year, opts)
  stars = "stars:#{opts['min-stars']}..#{opts['max-stars']}"
  size = "size:>=#{opts['min-size']}"
  created = "created:#{year}-01-01..#{year}-12-31"
  fixed = %w[is:public mirror:false archived:false template:false NOT android]
  [stars, size, 'language:java', created, *fixed].join(' ')
end
103+
104+
# Pages through the search results of one query until enough repositories
# are found, the results run out, or the per-query result cap is reached.
#
# query   - search query string passed on to fetch_repositories
# github  - API client passed on to fetch_repositories
# context - Hash with :found (repos collected so far), :opts (reads
#           :total) and :size (repos per page)
#
# Pages are numbered from 1, as in GitHub pagination; starting from 0
# would request the first page twice.
# NOTE(review): in dry mode the page number is also passed to mock_reps,
# whose body is not visible here - confirm it accepts 1-based pages.
def loop_through_pages(query, github, context)
  # A single search query never yields more than this many results, so
  # asking for pages beyond it is pointless; the caller then moves on to
  # the next year.
  max_results = 1000
  page = 1
  loop do
    break if context[:found].count >= context[:opts][:total]
    break if (page - 1) * context[:size] >= max_results

    json = fetch_repositories(query, github, page, context)
    break if json[:items].empty?

    process_repositories(json[:items], context)
    page += 1
    cooldown(context)
  end
end
115+
116+
# Fetches one page of search results.
#
# In dry mode (context[:opts][:dry]) no API call is made and the page
# comes from mock_reps; otherwise the query is sent through
# github.search_repositories with context[:size] repositories per page.
def fetch_repositories(query, github, page, context)
  per_page = context[:size]
  return mock_reps(page, per_page, context[:licenses]) if context[:opts][:dry]

  github.search_repositories(query, per_page: per_page, page: page)
end
123+
124+
# Hands every repository of a result page to process_repo, together with
# the collected repositories and the accepted licenses from context.
def process_repositories(repositories, context)
  found = context[:found]
  licenses = context[:licenses]
  repositories.each { |item| process_repo(item, found, licenses) }
end
129+
130+
# Records a single repository unless it was already recorded or its
# license is not acceptable; a recorded repository is also announced on
# stdout via print_repo_info.
def process_repo(repo_data, found, licenses)
  rejected = repo_already_processed?(repo_data, found) || license_invalid?(repo_data, licenses)
  unless rejected
    add_repo_to_found(repo_data, found)
    print_repo_info(repo_data, found)
  end
end
136+
137+
# Tells whether the repository is already among the found ones, which
# are keyed by full name.
def repo_already_processed?(repo_data, found)
  full_name = repo_data[:full_name]
  found.key?(full_name)
end
140+
141+
# Tells whether the repository has no license or a license whose key is
# not in the accepted list; a rejected repository is reported on stdout.
def license_invalid?(repo_data, licenses)
  license = repo_data[:license]
  return false if !license.nil? && licenses.include?(license[:key])

  puts "Repo #{repo_data[:full_name]} doesn't contain required license. Skipping"
  true
end
146+
147+
# Stores a summary of the repository in found, keyed by its full name,
# and returns that summary.
#
# The creation time is stored in ISO 8601 form, the description is
# wrapped in double quotes and the topics are joined by spaces.
def add_repo_to_found(repo_data, found)
  name = repo_data[:full_name]
  summary = {
    full_name: name,
    default_branch: repo_data[:default_branch],
    created_at: repo_data[:created_at].iso8601,
    open_issues_count: repo_data[:open_issues_count],
    description: "\"#{repo_data[:description]}\"",
    topics: Array(repo_data[:topics]).join(' '),
    stars: repo_data[:stargazers_count],
    forks: repo_data[:forks_count],
    size: repo_data[:size]
  }
  found[name] = summary
end
158+
159+
# Announces a freshly recorded repository on stdout: its name, its
# ordinal number among the found ones, forks, stars and license key.
def print_repo_info(repo, found)
  name = repo[:full_name].inspect
  ordinal = found.count
  forks = repo[:forks_count]
  stars = repo[:stargazers_count]
  license = repo[:license][:key]
  puts "Found #{name} GitHub repo ##{ordinal} " \
       "(#{forks} forks, #{stars} stars) with license: #{license}"
end
163+
164+
# Pauses between API calls for context[:opts][:pause] seconds, after
# printing how many repositories are found and how many are needed.
def cooldown(context)
  settings = context[:opts]
  seconds = settings[:pause]
  puts "Let's sleep for #{seconds} seconds to cool off GitHub API " \
       "(already found #{context[:found].count} repos, need #{settings[:total]})..."
  sleep settings[:pause]
end
169+
170+
# Query year by year, newest first, from --start-year down to 2008, and
# stop as soon as the requested number of repositories is collected.
current_year = opts[:start_year]
years = current_year.downto(2008).to_a
# The query of the most recently processed year. It is printed in the
# LaTeX summary further down, so it has to be filled in here; otherwise
# the summary would show an empty query.
final_query = ''
# The same objects are shared by all years, so the context is built once.
context = {
  found: found,
  opts: opts,
  licenses: licenses,
  size: size
}

puts 'Not searching GitHub API, using mock repos' if opts[:dry]
years.each do |year|
  break if found.count >= opts[:total]

  final_query = build_query(year, opts)
  process_year(year, github, context)
end
139185
puts "Found #{found.count} total repositories in GitHub"
140186

@@ -158,7 +204,7 @@ def cooldown(opts, found)
158204
' GitHub API\footnote{\url{https://docs.github.com/en/rest}}',
159205
' was the following:',
160206
'\begin{ffcode}',
161-
query.gsub(' ', "\n"),
207+
final_query.gsub(' ', "\n"),
162208
'\end{ffcode}'
163209
].join("\n")
164210
)

tests/steps/test-discover-repos.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ tex=${TARGET}/foo.tex
3232
rm -f "${csv}"
# Pin --start-year: the script defaults it to the current year, while the
# expectation below names 2024 explicitly.
msg=$("${LOCAL}/steps/discover-repos.rb" --dry --pause=0 --total=3 --page-size=1 --min-stars=100 --max-stars=1000 --start-year=2024 "--csv=${csv}" "--tex=${tex}")
echo "${msg}"
echo "${msg}" | grep "Completed querying for year 2024. Found 3 repositories so far."
echo "${msg}" | grep "Found 3 total repositories in GitHub"
test -e "${csv}"
test -s "${tex}"

0 commit comments

Comments
 (0)