#!/usr/bin/rdmd
// "Bad interpreter"? Download dmd from <https://dlang.org>.
// This program downloads as many as possible of the files listed at
// <https://github.com/climate-mirror/datasets/issues/272>. Instructions for
// Linux:
//
// 1. Copy the long list of URLs from that GitHub issue into a file called
// links.txt, one URL per line. (There's an example of the expected format
// at the end of this comment.)
//
// 2. Visit <https://dlang.org>; download and install DMD, the reference D
// compiler.
//
// 3. Make sure lftp is installed on your computer. On Linux, `which lftp' will
// print its path if lftp is installed, and nothing otherwise.
//
// 4. Download getcoris.d into a new directory.
//
// 5. Make it executable:
//
// chmod +x getcoris.d
//
// 6. Run it:
// ./getcoris.d
//
// It'll take a few seconds to compile, and then it'll start producing
// screen output to show its progress. It'll create directory trees
// automatically. To see what you've downloaded, type `find'.
//
// 7. getcoris.d downloads files in a random order, partly to spread the load
// across servers (to make best use of your bandwidth and reduce the chances
// of your being banned), and partly so that several people running
// getcoris.d will achieve greater coverage, even if none of them manages a
// complete download.
//
// 8. If you interrupt getcoris.d with Ctrl+C and later restart it, it'll
// first download the files it was in the middle of downloading before (to
// minimise the number of truncated files on your file system) and then
// download only those files it doesn't already have. Nevertheless, not
// every server supports resumable downloads, so try to interrupt
// getcoris.d as infrequently as possible.
//
// This is not an industrial-strength downloader, and doesn't cope robustly with
// (for example) I/O errors in the progress file. It's a throw-away program to
// download one large data set, starting as soon as possible. Sometimes, good
// enough is good enough.
//
// This program has been tested on Linux only. I haven't tested it on Windows,
// because my employer asks me not to use its resources for OSS development,
// but I'll happily take bug reports or (even better) pull requests from Windows
// users.
//
// To do:
// * Do we need to get screen output under control? Show what's completed
// as well as what's started?
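//
// For illustration, links.txt should hold one bare URL per line, e.g. (these
// example URLs are made up):
//
// ftp://ftp.example.com/data/subdir/
// https://www.example.org/archive/readings.csv
//
// URLs ending in "/" are mirrored as whole directory trees; URLs of plain
// data files are downloaded individually; web pages are skipped. See
// do_one_url() below.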
import std.algorithm.comparison;
import std.algorithm.iteration;
import std.array;
import std.file;
import std.format;
import std.parallelism;
import std.process;
import std.random;
import std.range;
import std.regex;
import std.stdio;
// How many parallel downloads would you like?
enum NrParallelJobs = 4;
// What's the name of your text file full of links?
enum LinkFileName = "links.txt";
// What's the name of the progress file that enables us to resume downloading
// if we're interrupted?
enum ProgressFileName = "progress.txt";
// False to download files, true just to say what we'd do:
enum DryRun = false;
// Status of a single download or mirror operation:
enum Progress {None, Started, Finished}
// Update the progress file:
auto update(in Progress progress, in char[] url) {
assert(progress != Progress.None);
const prefix = progress == Progress.Started? "S ": "F ";
synchronized {
auto fh = File(ProgressFileName, "a");
fh.writeln(prefix, url);
}
}
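// For illustration, after one finished and one interrupted download the
// progress file might read (the URLs are made up):
//
// S ftp://ftp.example.com/data/a.csv
// F ftp://ftp.example.com/data/a.csv
// S ftp://ftp.example.com/data/b.csv
//
// read_progress_file() below folds duplicate entries together, so here a.csv
// counts as Finished and b.csv as Started.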
// Run a single command, or just dump it to the screen if in dry-run mode:
auto execute(const char[][] argv, const char[] url) {
update(Progress.Started, url);
if (DryRun) {
import core.thread;
writeln(argv);
Thread.sleep(dur!"seconds"(2));
update(Progress.Finished, url);
}
else {
auto pid = spawnProcess(argv);
const rc = wait(pid);
if (rc != 0)
writeln("Warning: exit code ", rc, " from ", argv.join(' '));
else
update(Progress.Finished, url);
}
}
// Use lftp to mirror an entire directory tree:
auto mirror_directory(in char[] url, in char[] directory) {
mkdirRecurse(directory);
const commands = format("open %s; mirror --continue --parallel=2 . %s", url, directory);
const args = ["lftp", "-c", commands];
execute(args, url);
}
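// For a (made-up) URL like ftp://ftp.example.com/data/subdir/, the call above
// amounts to running:
//
// lftp -c "open ftp://ftp.example.com/data/subdir/; mirror --continue --parallel=2 . ftp.example.com/data/subdir"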
// Split "directory/base_name" at the last "/":
enum rx_path_and_base = ctRegex!(`(.+) / (.+)`, "x");
// Use lftp to download a single file:
auto download_file(in char[] url, in char[] resource) {
const(char)[] local_directory, remote_directory, base_name;
if (auto caps = resource.matchFirst(rx_path_and_base)) {
local_directory = caps[1];
base_name = caps[2];
}
else
return;
if (auto caps = url.matchFirst(rx_path_and_base))
remote_directory = caps[1];
else
return;
// Now, for ftp://ftp.example.com/data/subdir/foo.txt,
// * local_directory = "ftp.example.com/data/subdir"
// * remote_directory = "ftp://ftp.example.com/data/subdir"
// * base_name = "foo.txt"
mkdirRecurse(local_directory);
const commands = format("open %s; get -c -O %s %s", remote_directory, local_directory, base_name);
const args = ["lftp", "-c", commands];
execute(args, url);
}
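// A sanity check of the path/base split above, compiled in only with
// -unittest (the URL is made up):
unittest {
    auto caps = "ftp.example.com/data/subdir/foo.txt".matchFirst(rx_path_and_base);
    assert(caps[1] == "ftp.example.com/data/subdir");
    assert(caps[2] == "foo.txt");
}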
// Handle a single URL of any type:
// rx_no_protocol splits a URL into its protocol and the protocol-less
// remainder; rx_is_directory spots URLs ending in "/"; rx_usable_file accepts
// any lowercase file extension other than a web-page one:
enum rx_no_protocol = ctRegex!(` ^ ([a-z]+ ://) (.{4,})`, "x");
enum rx_is_directory = ctRegex!(` / $ `, "x");
enum rx_usable_file = ctRegex!(` \. (?! asp x? | htm l? | php) [a-z]+ $ `, "x");
auto do_one_url(const(char)[] url) {
const(char)[] protocol, resource;
if (auto caps = url.matchFirst(rx_no_protocol)) {
protocol = caps[1];
resource = caps[2];
}
else
return;
if (url.matchFirst(rx_is_directory))
mirror_directory(url, resource[0 .. $-1]); // keep all but the trailing "/"
else if (url.matchFirst(rx_usable_file))
download_file(url, resource);
}
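// How the classification above plays out, compiled in only with -unittest
// (the names are made up): trailing-slash URLs are mirrored, web pages are
// skipped, and other extensions are downloaded:
unittest {
    assert(!"ftp.example.com/data/subdir/".matchFirst(rx_is_directory).empty);
    assert("www.example.org/page.html".matchFirst(rx_usable_file).empty);
    assert(!"www.example.org/readings.csv".matchFirst(rx_usable_file).empty);
}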
// Upon startup, read the progress file maintained by update() above:
alias ProgressFile = Progress[string];
enum rx_progress_line = ctRegex!(`^ ([SF]) \s (\S+)`, "x");
auto read_progress_file(ref ProgressFile pfile) {
// The first time getcoris is run, the progress file won't exist:
if (!exists(ProgressFileName))
return;
// If it does exist, though, it'd better be readable, because we don't
// want to waste bandwidth by silently failing to read it.
foreach (line; File(ProgressFileName).byLineCopy)
if (auto caps = line.matchFirst(rx_progress_line)) {
auto indicator = caps[1], url = caps[2];
auto progress = indicator[0] == 'S'? Progress.Started: Progress.Finished;
if (auto ptr = url in pfile)
*ptr = max(*ptr, progress);
else
pfile[url] = progress;
}
}
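// Duplicate progress entries are merged with max(), which relies on the enum
// ordering None < Started < Finished: once a URL is recorded as Finished, an
// earlier Started line can't demote it. Compiled in only with -unittest:
unittest {
    assert(max(Progress.Started, Progress.Finished) == Progress.Finished);
}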
// parallel() in main() uses the calling thread as a worker too, so we need at
// least one pool thread to go with it:
static assert(NrParallelJobs > 1);
// Read the progress file and the list of URLs; download all files we've not
// yet downloaded, starting with any work in progress:
void main() {
ProgressFile prog_file;
read_progress_file(prog_file);
auto started_urls = prog_file.byKey
.filter!(url => (prog_file[url] == Progress.Started));
auto unstarted_urls = File(LinkFileName)
.byLineCopy
.filter!(url => url !in prog_file)
.array
.randomCover;
auto pending_urls = chain(started_urls, unstarted_urls);
// The main thread joins in the downloading, hence one fewer pool thread:
defaultPoolThreads = NrParallelJobs - 1;
foreach (link; parallel(pending_urls, 1))
do_one_url(link);
}