diff --git a/Artifacts.toml b/Artifacts.toml new file mode 100644 index 00000000..b67e9d86 --- /dev/null +++ b/Artifacts.toml @@ -0,0 +1,2 @@ +[MNIST] +git-tree-sha1 = "bca582d83e460d262193b91a8d4eba481ce2d2f1" diff --git a/Project.toml b/Project.toml index 508fd367..d5a03359 100644 --- a/Project.toml +++ b/Project.toml @@ -9,7 +9,9 @@ DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" +LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Requires = "ae029012-a4dd-5104-9daa-d747884805df" [compat] @@ -18,7 +20,6 @@ ColorTypes = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10" DataDeps = "0.3, 0.4, 0.5, 0.6, 0.7" FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8" GZip = "0.5" -ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8" MAT = "0.7, 0.8, 0.9, 0.10" Requires = "1" julia = "1" diff --git a/src/MNIST/MNIST.jl b/src/MNIST/MNIST.jl index 2bcee951..694f5906 100644 --- a/src/MNIST/MNIST.jl +++ b/src/MNIST/MNIST.jl @@ -23,11 +23,11 @@ the 10 possible digits (0-9). - [`MNIST.convert2image`](@ref) """ module MNIST - using DataDeps + using Pkg.Artifacts + using LazyArtifacts using ColorTypes using FixedPointNumbers - using ..MLDatasets: bytes_to_type, datafile, download_dep, download_docstring, - _colorview + using ..MLDatasets: bytes_to_type, _colorview export @@ -46,54 +46,31 @@ module MNIST @deprecate convert2features reshape - const DEPNAME = "MNIST" + const ARTIFACT_NAME = "MNIST" const TRAINIMAGES = "train-images-idx3-ubyte.gz" const TRAINLABELS = "train-labels-idx1-ubyte.gz" const TESTIMAGES = "t10k-images-idx3-ubyte.gz" const TESTLABELS = "t10k-labels-idx1-ubyte.gz" - """ - download([dir]; [i_accept_the_terms_of_use]) - - Trigger the (interactive) download of the full dataset into - "`dir`". 
If no `dir` is provided the dataset will be - downloaded into "~/.julia/datadeps/$DEPNAME". - - This function will display an interactive dialog unless - either the keyword parameter `i_accept_the_terms_of_use` or - the environment variable `DATADEPS_ALWAYS_ACCEPT` is set to - `true`. Note that using the data responsibly and respecting - copyright/terms-of-use remains your responsibility. - """ - download(args...; kw...) = download_dep(DEPNAME, args...; kw...) - + include(joinpath("Reader","Reader.jl")) include("interface.jl") include("utils.jl") - function __init__() - register(DataDep( - DEPNAME, - """ - Dataset: THE MNIST DATABASE of handwritten digits - Authors: Yann LeCun, Corinna Cortes, Christopher J.C. Burges - Website: http://yann.lecun.com/exdb/mnist/ - - [LeCun et al., 1998a] - Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner. - "Gradient-based learning applied to document recognition." - Proceedings of the IEEE, 86(11):2278-2324, November 1998 - - The files are available for download at the offical - website linked above. Note that using the data - responsibly and respecting copyright remains your - responsibility. The authors of MNIST aren't really - explicit about any terms of use, so please read the - website to make sure you want to download the - dataset. 
- """, - "https://ossci-datasets.s3.amazonaws.com/mnist/" .* [TRAINIMAGES, TRAINLABELS, TESTIMAGES, TESTLABELS], - "0bb1d5775d852fc5bb32c76ca15a7eb4e9a3b1514a2493f7edfcf49b639d7975", - )) + + artifact_toml = joinpath(@__DIR__, "..", "..", "Artifacts.toml") + _hash = artifact_hash(ARTIFACT_NAME, artifact_toml) + + if _hash === nothing || !artifact_exists(_hash) + _hash = create_artifact() do artifact_dir + url_base = "https://ossci-datasets.s3.amazonaws.com/mnist/" + for file in [TRAINIMAGES, TRAINLABELS, + TESTIMAGES, TESTLABELS] + download(url_base * file, joinpath(artifact_dir, file)) #= url_base already ends with "/"; a second slash would address a different S3 key =# + end + end + bind_artifact!(artifact_toml, ARTIFACT_NAME, _hash, lazy=true, force=true) #= force: Artifacts.toml already ships an MNIST entry, so an unforced bind errors =# end + + end diff --git a/src/MNIST/interface.jl b/src/MNIST/interface.jl index 06453d0a..56faf03e 100644 --- a/src/MNIST/interface.jl +++ b/src/MNIST/interface.jl @@ -1,5 +1,5 @@ """ - traintensor([T = N0f8], [indices]; [dir]) -> Array{T} + traintensor([T = N0f8], [indices]) -> Array{T} Returns the MNIST **training** images corresponding to the given `indices` as a multi-dimensional array of eltype `T`. @@ -42,21 +42,19 @@ julia> MNIST.convert2image(MNIST.traintensor(1)) # convert to column-major color 28×28 Array{Gray{N0f8},2}: [...] ``` - -$(download_docstring("MNIST", DEPNAME)) """ -function traintensor(::Type{T}, args...; dir = nothing) where T - path = datafile(DEPNAME, TRAINIMAGES, dir) +function traintensor(::Type{T}, args...) where T + path = joinpath(artifact"MNIST", TRAINIMAGES) images = Reader.readimages(path, args...) bytes_to_type(T, images) end -function traintensor(args...; dir = nothing) - traintensor(N0f8, args...; dir = dir) +function traintensor(args...) + traintensor(N0f8, args...) end """ - testtensor([T = N0f8], [indices]; [dir]) -> Array{T} + testtensor([T = N0f8], [indices]) -> Array{T} Returns the MNIST **test** images corresponding to the given `indices` as a multi-dimensional array of eltype `T`. 
@@ -99,21 +97,19 @@ julia> MNIST.convert2image(MNIST.testtensor(1)) # convert to column-major colora 28×28 Array{Gray{N0f8},2}: [...] ``` - -$(download_docstring("MNIST", DEPNAME)) """ -function testtensor(::Type{T}, args...; dir = nothing) where T - path = datafile(DEPNAME, TESTIMAGES, dir) +function testtensor(::Type{T}, args...) where T + path = joinpath(artifact"MNIST", TESTIMAGES) images = Reader.readimages(path, args...) bytes_to_type(T, images) end -function testtensor(args...; dir = nothing) - testtensor(N0f8, args...; dir = dir) +function testtensor(args...) + testtensor(N0f8, args...) end """ - trainlabels([indices]; [dir]) + trainlabels([indices]) Returns the MNIST **trainset** labels corresponding to the given `indices` as an `Int` or `Vector{Int}`. The values of the labels @@ -138,21 +134,19 @@ julia> MNIST.trainlabels(1:3) # first three labels julia> MNIST.trainlabels(1) # first label 5 ``` - -$(download_docstring("MNIST", DEPNAME)) """ -function trainlabels(args...; dir = nothing) - path = datafile(DEPNAME, TRAINLABELS, dir) +function trainlabels(args...) + path = joinpath(artifact"MNIST", TRAINLABELS) Vector{Int}(Reader.readlabels(path, args...)) end -function trainlabels(index::Integer; dir = nothing) - path = datafile(DEPNAME, TRAINLABELS, dir) +function trainlabels(index::Integer) + path = joinpath(artifact"MNIST", TRAINLABELS) Int(Reader.readlabels(path, index)) end """ - testlabels([indices]; [dir]) + testlabels([indices]) Returns the MNIST **testset** labels corresponding to the given `indices` as an `Int` or `Vector{Int}`. The values of the labels @@ -177,21 +171,19 @@ julia> MNIST.testlabels(1:3) # first three labels julia> MNIST.testlabels(1) # first label 7 ``` - -$(download_docstring("MNIST", DEPNAME)) """ -function testlabels(args...; dir = nothing) - path = datafile(DEPNAME, TESTLABELS, dir) +function testlabels(args...) 
+ path = joinpath(artifact"MNIST", TESTLABELS) Vector{Int}(Reader.readlabels(path, args...)) end -function testlabels(index::Integer; dir = nothing) - path = datafile(DEPNAME, TESTLABELS, dir) +function testlabels(index::Integer) + path = joinpath(artifact"MNIST", TESTLABELS) Int(Reader.readlabels(path, index)) end """ - traindata([T = N0f8], [indices]; [dir]) -> Tuple + traindata([T = N0f8], [indices]) -> Tuple Returns the MNIST **trainingset** corresponding to the given `indices` as a two-element tuple. If `indices` is omitted the @@ -209,23 +201,20 @@ represent. ```julia train_x, train_y = MNIST.traindata() # full datatset train_x, train_y = MNIST.traindata(2) # only second observation -train_x, train_y = MNIST.traindata(dir="./MNIST") # custom folder ``` -$(download_docstring("MNIST", DEPNAME)) - Take a look at [`MNIST.traintensor`](@ref) and [`MNIST.trainlabels`](@ref) for more information. """ -function traindata(::Type{T}, args...; dir = nothing) where T - (traintensor(T, args...; dir = dir), - trainlabels(args...; dir = dir)) +function traindata(::Type{T}, args...) where T + (traintensor(T, args...), + trainlabels(args...)) end -traindata(args...; dir = nothing) = traindata(N0f8, args...; dir = dir) +traindata(args...) = traindata(N0f8, args...) """ - testdata([T = N0f8], [indices]; [dir]) -> Tuple + testdata([T = N0f8], [indices]) -> Tuple Returns the MNIST **testset** corresponding to the given `indices` as a two-element tuple. If `indices` is omitted the @@ -243,17 +232,14 @@ represent. ```julia test_x, test_y = MNIST.testdata() # full datatset test_x, test_y = MNIST.testdata(2) # only second observation -test_x, test_y = MNIST.testdata(dir="./MNIST") # custom folder ``` -$(download_docstring("MNIST", DEPNAME)) - Take a look at [`MNIST.testtensor`](@ref) and [`MNIST.testlabels`](@ref) for more information. 
""" -function testdata(::Type{T}, args...; dir = nothing) where T - (testtensor(T, args...; dir = dir), - testlabels(args...; dir = dir)) +function testdata(::Type{T}, args...) where T + (testtensor(T, args...), + testlabels(args...)) end -testdata(args...; dir = nothing) = testdata(N0f8, args...; dir = dir) +testdata(args...) = testdata(N0f8, args...)