Skip to content

Commit

Permalink
Check existence before downloading/extracting(#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
salu133445 committed Jan 15, 2021
1 parent be4bd7a commit 2973343
Show file tree
Hide file tree
Showing 9 changed files with 70 additions and 106 deletions.
55 changes: 31 additions & 24 deletions docs/_modules/muspy/datasets/base.html

Large diffs are not rendered by default.

12 changes: 1 addition & 11 deletions docs/datasets/base.html
Original file line number Diff line number Diff line change
Expand Up @@ -217,11 +217,6 @@ <h1>Base Dataset Classes<a class="headerlink" href="#base-dataset-classes" title
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down Expand Up @@ -404,7 +399,7 @@ <h1>Base Dataset Classes<a class="headerlink" href="#base-dataset-classes" title
<dl class="py method">
<dt>
<code class="sig-name descname">download</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">overwrite</span><span class="o">=</span><span class="default_value">False</span></em>, <em class="sig-param"><span class="n">verbose</span><span class="o">=</span><span class="default_value">True</span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/muspy/datasets/base.html#RemoteDataset.download"><span class="viewcode-link">[source]</span></a></dt>
<dd><p>Download the source datasets.</p>
<dd><p>Download the dataset source(s).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
Expand Down Expand Up @@ -494,11 +489,6 @@ <h1>Base Dataset Classes<a class="headerlink" href="#base-dataset-classes" title
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down
15 changes: 0 additions & 15 deletions docs/datasets/local.html
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,6 @@ <h1>Local Dataset Classes<a class="headerlink" href="#local-dataset-classes" tit
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down Expand Up @@ -524,11 +519,6 @@ <h1>Local Dataset Classes<a class="headerlink" href="#local-dataset-classes" tit
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down Expand Up @@ -742,11 +732,6 @@ <h1>Local Dataset Classes<a class="headerlink" href="#local-dataset-classes" tit
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down
21 changes: 3 additions & 18 deletions docs/datasets/remote.html
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ <h1>Remote Dataset Classes<a class="headerlink" href="#remote-dataset-classes" t
<dl class="py method">
<dt>
<code class="sig-name descname">download</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">overwrite</span><span class="o">=</span><span class="default_value">False</span></em>, <em class="sig-param"><span class="n">verbose</span><span class="o">=</span><span class="default_value">True</span></em><span class="sig-paren">)</span></dt>
<dd><p>Download the source datasets.</p>
<dd><p>Download the dataset source(s).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
Expand Down Expand Up @@ -386,11 +386,6 @@ <h1>Remote Dataset Classes<a class="headerlink" href="#remote-dataset-classes" t
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down Expand Up @@ -561,7 +556,7 @@ <h1>Remote Dataset Classes<a class="headerlink" href="#remote-dataset-classes" t
<dl class="py method">
<dt>
<code class="sig-name descname">download</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">overwrite</span><span class="o">=</span><span class="default_value">False</span></em>, <em class="sig-param"><span class="n">verbose</span><span class="o">=</span><span class="default_value">True</span></em><span class="sig-paren">)</span></dt>
<dd><p>Download the source datasets.</p>
<dd><p>Download the dataset source(s).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
Expand Down Expand Up @@ -651,11 +646,6 @@ <h1>Remote Dataset Classes<a class="headerlink" href="#remote-dataset-classes" t
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down Expand Up @@ -823,7 +813,7 @@ <h1>Remote Dataset Classes<a class="headerlink" href="#remote-dataset-classes" t
<dl class="py method">
<dt>
<code class="sig-name descname">download</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">overwrite</span><span class="o">=</span><span class="default_value">False</span></em>, <em class="sig-param"><span class="n">verbose</span><span class="o">=</span><span class="default_value">True</span></em><span class="sig-paren">)</span></dt>
<dd><p>Download the source datasets.</p>
<dd><p>Download the dataset source(s).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
Expand Down Expand Up @@ -939,11 +929,6 @@ <h1>Remote Dataset Classes<a class="headerlink" href="#remote-dataset-classes" t
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down
7 changes: 1 addition & 6 deletions docs/doc/datasets.html
Original file line number Diff line number Diff line change
Expand Up @@ -295,11 +295,6 @@ <h2>Dataset Classes<a class="headerlink" href="#dataset-classes" title="Permalin
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down Expand Up @@ -899,7 +894,7 @@ <h2>Dataset Classes<a class="headerlink" href="#dataset-classes" title="Permalin
<dl class="py method">
<dt id="muspy.datasets.RemoteDataset.download">
<code class="sig-name descname">download</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">overwrite</span><span class="o">=</span><span class="default_value">False</span></em>, <em class="sig-param"><span class="n">verbose</span><span class="o">=</span><span class="default_value">True</span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/muspy/datasets/base.html#RemoteDataset.download"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#muspy.datasets.RemoteDataset.download" title="Permalink to this definition"></a></dt>
<dd><p>Download the source datasets.</p>
<dd><p>Download the dataset source(s).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
Expand Down
7 changes: 1 addition & 6 deletions docs/doc/muspy.html
Original file line number Diff line number Diff line change
Expand Up @@ -1579,11 +1579,6 @@ <h2>Features<a class="headerlink" href="#features" title="Permalink to this head
</ul>
</dd>
</dl>
<p class="rubric">Notes</p>
<p>Each converted file will be named by its index. The original
filenames can be found in the <code class="docutils literal notranslate"><span class="pre">filenames</span></code> attribute. For
example, the file at <code class="docutils literal notranslate"><span class="pre">filenames[i]</span></code> will be converted and
saved to <code class="docutils literal notranslate"><span class="pre">{i}.json</span></code>.</p>
</dd></dl>

<dl class="py method">
Expand Down Expand Up @@ -2183,7 +2178,7 @@ <h2>Features<a class="headerlink" href="#features" title="Permalink to this head
<dl class="py method">
<dt id="muspy.RemoteDataset.download">
<code class="sig-name descname">download</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">overwrite</span><span class="o">=</span><span class="default_value">False</span></em>, <em class="sig-param"><span class="n">verbose</span><span class="o">=</span><span class="default_value">True</span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/muspy/datasets/base.html#RemoteDataset.download"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#muspy.RemoteDataset.download" title="Permalink to this definition"></a></dt>
<dd><p>Download the source datasets.</p>
<dd><p>Download the dataset source(s).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
Expand Down
2 changes: 1 addition & 1 deletion docs/searchindex.js

Large diffs are not rendered by default.

53 changes: 30 additions & 23 deletions muspy/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,20 @@
)

import numpy as np
from joblib import Parallel, delayed
from numpy.random import RandomState, permutation
from tqdm import tqdm

from ..inputs import load, read_abc_string
from ..music import Music
from ..outputs import save
from .utils import download_url, extract_archive
from .utils import (
check_md5,
check_sha256,
check_size,
download_url,
extract_archive,
)

try:
from torch.utils.data import Dataset as TorchDataset
Expand All @@ -38,12 +45,6 @@
except ImportError:
HAS_TENSORFLOW = False

try:
from joblib import Parallel, delayed

HAS_JOBLIB = True
except ImportError:
HAS_JOBLIB = False

RemoteDatasetType = TypeVar("RemoteDatasetType", bound="RemoteDataset")
FolderDatasetType = TypeVar("FolderDatasetType", bound="FolderDataset")
Expand Down Expand Up @@ -133,13 +134,6 @@ def save(
**kwargs
Keyword arguments to pass to :func:`muspy.save`.
Notes
-----
Each converted file will be named by its index. The original
filenames can be found in the ``filenames`` attribute. For
example, the file at ``filenames[i]`` will be converted and
saved to ``{i}.json``.
"""
if kind not in ("json", "yaml"):
raise TypeError("`kind` must be either 'json' or 'yaml'.")
Expand Down Expand Up @@ -171,19 +165,13 @@ def _saver(idx):
if _saver(idx):
count += 1
else:
if not HAS_JOBLIB:
raise ValueError(
"Optional package joblib is required for multiprocessing "
"(n_jobs > 1)."
)
# TODO: This is slow as `self` is passed between workers.
results = Parallel(n_jobs=n_jobs, backend="threading", verbose=5)(
delayed(_saver)(idx) for idx in range(len(self))
)
count = results.count(True)
if verbose:
print(f"Successfully saved {count} out of {len(self)} files.")
(root / ".muspy.success").touch(exist_ok=True)

def split(
self,
Expand Down Expand Up @@ -531,14 +519,20 @@ def source_exists(self) -> bool:
filename = self.root / source["filename"]
if not filename.is_file():
return False
if "size" in source and filename.stat().st_size != source["size"]:
if "size" in source and not check_size(filename, source["size"]):
return False
if "md5" in source and not check_md5(filename, source["md5"]):
return False
if "sha256" in source and not check_sha256(
filename, source["sha256"]
):
return False
return True

def download(
self: RemoteDatasetType, overwrite: bool = False, verbose: bool = True
) -> RemoteDatasetType:
"""Download the source datasets.
"""Download the dataset source(s).
Parameters
----------
Expand All @@ -552,6 +546,13 @@ def download(
Object itself.
"""
if self.exists():
if verbose:
print(
"Skip downloading as the `.muspy.success` file is found."
)
return self

for source in self._sources.values():
download_url(
source["url"],
Expand Down Expand Up @@ -582,6 +583,11 @@ def extract(
Object itself.
"""
if self.exists():
if verbose:
print("Skip extracting as the `.muspy.success` file is found.")
return self

for source in self._sources.values():
filename = self.root / source["filename"]
if source["archive"]:
Expand Down Expand Up @@ -997,7 +1003,7 @@ def convert(
"""
if self.converted_exists():
if verbose:
print("Skip conversion as the converted folder exists.")
print("Skip conversion as the `.muspy.success` file is found.")
return self
self.on_the_fly()
self.converted_dir.mkdir(exist_ok=True)
Expand All @@ -1009,6 +1015,7 @@ def convert(
verbose=verbose,
**kwargs,
)
(self.converted_dir / ".muspy.success").touch(exist_ok=True)
self.use_converted()
self.kind = kind
return self
Expand Down
4 changes: 2 additions & 2 deletions muspy/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def download_url(
"""
path = Path(path)
if not overwrite and path.is_file():
if size is not None and path.stat().st_size != size:
if size is not None and not check_size(path, size):
raise RuntimeError(
"Existing file has a different size from the expected one."
)
Expand All @@ -176,7 +176,7 @@ def download_url(
urlretrieve(url, path)

# Run checks
if size is not None and path.stat().st_size != size:
if size is not None and not check_size(path, size):
raise RuntimeError(
"Downloaded file has a different size from the expected one."
)
Expand Down

0 comments on commit 2973343

Please sign in to comment.