chore: testing workflow
salman2013 committed Apr 16, 2024
1 parent c848767 commit 5eaa65d
Showing 2 changed files with 397 additions and 0 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/check_dependencies.yml
@@ -0,0 +1,24 @@
name: Check Python Dependencies

on:
  pull_request:

defaults:
  run:
    shell: bash # strict bash

jobs:
  check_dependencies:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'

      - name: Install script dependencies
        # find_dependencies.py imports requests and rich (see the script below).
        run: python -m pip install requests rich

      - name: Run Python script
        # The script lives under scripts/ and expects a repo directory plus an --org value.
        run: python scripts/find_dependencies.py . --org edx
373 changes: 373 additions & 0 deletions scripts/find_dependencies.py
@@ -0,0 +1,373 @@
"""
Spider and catalog dependencies.
$ export OLIVE_DIRS=$(gittreeif origin/open-release/olive.master -q pwd)
$ python find_deps.py $OLIVE_DIRS
"""

import concurrent.futures
import contextlib
import functools
import itertools
import json
import os
import re
import shlex
import shutil
import subprocess
import sys
import tarfile
import time
import zipfile
from pathlib import Path
from typing import Iterable, Optional, Tuple

import requests
from rich.progress import Progress, MofNCompleteColumn

# pylint: disable=unspecified-encoding

# Unbounded memoization for the repeated network and metadata lookups below.
cached = functools.lru_cache(maxsize=None)

@contextlib.contextmanager
def change_dir(new_dir):
    """
    Change to a new directory, and then change back.
    Will make the directory if needed.
    """
    old_dir = os.getcwd()
    new_dir = Path(new_dir)
    new_dir.mkdir(parents=True, exist_ok=True)
    os.chdir(new_dir)
    try:
        yield new_dir
    finally:
        os.chdir(old_dir)


def run_command(cmd: str, outfile=None) -> Tuple[bool, str]:
    """
    Run a command line (with no shell). Write the output to a file.
    Returns a tuple:
        bool: true if the command succeeded.
        str: the output of the command.
    """
    proc = subprocess.run(
        shlex.split(cmd, posix=False),
        shell=False,
        check=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    output = proc.stdout.decode("utf-8")
    if outfile:
        Path(outfile).write_text(output)

    return proc.returncode == 0, output.strip()
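# For example (illustrative), ok, sha = run_command("git rev-parse HEAD", "sha.log")
# returns (True, "<commit sha>") and also writes the raw output to sha.log.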


@cached
def npm_repo_url(npm_spec: str) -> Optional[str]:
    """Given an npm spec like 'package@version', return a repo url."""
    pkg, _, ver = npm_spec.rpartition("@")
    url = f"https://registry.npmjs.org/{pkg}/{ver}"
    try:
        resp = requests.get(url, timeout=60)
        if resp.status_code != 200:
            print(f"{npm_spec}: {url} -> {resp.status_code}")
            return None
        jdata = resp.json()
    except requests.RequestException as exc:
        print(f"Couldn't fetch npm data for {npm_spec}: {exc}")
        return None
    repo = jdata.get("repository")
    if repo is None:
        return None
    if isinstance(repo, dict):
        repo = repo["url"]
    return repo

def canonical_url(url: str) -> str:
    """Canonicalize a repo URL, probably on GitHub."""
    for pat, repl in [
        (r"^git\+", ""),
        (r"#.*$", ""),
        (r"\.git$", ""),
        (r"^git:", "https:"),
        (r"^ssh://git@", "https://"),
        (r"^git@github\.com:", "https://github.com/"),
    ]:
        url = re.sub(pat, repl, url)
    if ":" not in url and url.count("/") == 1:
        url = f"https://github.com/{url}"
    return url
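# For example, canonical_url("git+ssh://git@github.com/lodash/lodash.git#abc123"),
# canonical_url("git@github.com:lodash/lodash.git"), and canonical_url("lodash/lodash")
# all reduce to "https://github.com/lodash/lodash".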

@cached
def find_real_url(url: str) -> Optional[str]:
    """Find the eventual real url for a redirected url."""
    while True:
        try:
            resp = requests.head(url, timeout=60, allow_redirects=True)
        except requests.RequestException as exc:
            print(f"Couldn't fetch {url}: {exc}")
            return None
        if resp.status_code == 429:
            # I didn't know you could get 429 from https://github.com, but you can...
            wait = int(resp.headers.get("Retry-After", 10))
            time.sleep(wait + 1)
        else:
            break

    if resp.status_code == 200:
        return resp.url
    return None


WORK_DIR = Path("/tmp/unpack_reqs")

def parallel_map(func, data, description):
    """Run func over data using threads, with a progress bar."""
    data = list(data)
    n_workers = os.cpu_count() or 1
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        with Progress(*Progress.get_default_columns(), MofNCompleteColumn()) as progress:
            pbar = progress.add_task(f"{description:20}", total=len(data))
            for result in executor.map(func, data):
                progress.update(pbar, advance=1)
                yield result

def write_list(path: str, lines: Iterable[str]):
    """Write a list of strings to a file."""
    with Path(path).open("w") as flist:
        for line in lines:
            print(line, file=flist)

def check_js_dependencies() -> Iterable[str]:
    """Check the JS dependencies in package-lock.json, returning a set of repo URLs."""
    print("Checking JavaScript dependencies")
    with Path("package-lock.json").open() as lockf:
        lock_data = json.load(lockf)

    deps = set()
    for name, pkg in lock_data["packages"].items():
        name = pkg.get("name") or name
        name = name.rpartition("node_modules/")[-1]
        version = pkg.get("version")
        if version is None:
            continue
        deps.add(f"{name}@{version}")
    write_list("deps.txt", sorted(deps))

    urls = set()
    for url in parallel_map(npm_repo_url, deps, "Getting npm URLs"):
        if url:
            urls.add(canonical_url(url))

    real_urls = set()
    for url in parallel_map(find_real_url, urls, "Getting real URLs"):
        if url:
            real_urls.add(url)

    print(f"{len(deps)} deps, {len(urls)} urls, {len(real_urls)} real urls")
    write_list("repo_urls.txt", sorted(real_urls))
    return real_urls

def check_py_dependencies() -> Iterable[str]:
    """Check the Python dependencies in base.txt, returning a set of repo URLs."""
    print("Checking Python dependencies")

    print("Creating venv")
    run_command("python3 -m venv .venv", "make_venv.log")
    run_command(".venv/bin/python3 -m pip install -U pip", "pip_upgrade.log")
    print("Downloading packages")
    run_command(".venv/bin/python3 -m pip download --dest files -r base.txt", "pip_download.log")

    urls = set()
    for url in parallel_map(repo_url_from_wheel, Path("files").glob("*.whl"), "Examining wheels"):
        if url:
            urls.add(canonical_url(url))

    for url in parallel_map(repo_url_from_tgz, Path("files").glob("*.tar.gz"), "Examining tar.gz"):
        if url:
            urls.add(canonical_url(url))

    with open("base.txt") as fbase:
        for line in fbase:
            if match := re.search(r"https://github.com[^@ #]*(\.git)?", line):
                urls.add(canonical_url(match[0]))

    real_urls = set()
    for url in parallel_map(find_real_url, urls, "Getting real URLs"):
        if url:
            real_urls.add(url)

    write_list("repo_urls.txt", sorted(real_urls))
    return real_urls

def matching_text(text, regexes):
    """Find a line in text matching a regex, and return the first regex group."""
    for regex in regexes:
        for line in text.splitlines():
            if match := re.search(regex, line):
                return match[1]
    return None
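# Note the loop nesting above: earlier regexes take priority over later ones,
# no matter where their matching lines appear in the text.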

@cached
def repo_url_from_wheel(wheel_path: str) -> Optional[str]:
    """Read metadata from a .whl file, returning the repo URL."""
    with zipfile.ZipFile(wheel_path) as whl_file:
        fmetadata = next((f for f in whl_file.namelist() if f.endswith("/METADATA")), None)
        if fmetadata is None:
            print(f"No metadata in {wheel_path}")
            return None
        with whl_file.open(fmetadata) as inner_file:
            metadata = inner_file.read().decode("utf-8")
    return repo_url_from_metadata(wheel_path, metadata)

@cached
def repo_url_from_tgz(tgz_path: str) -> Optional[str]:
    """Read metadata from a .tar.gz file, returning the repo URL."""
    with tarfile.open(tgz_path) as tgz_file:
        fmetadata = next((f for f in tgz_file.getnames() if f.endswith("/PKG-INFO")), None)
        if fmetadata is None:
            print(f"No metadata in {tgz_path}")
            return None
        metadata = tgz_file.extractfile(fmetadata).read().decode("utf-8")
    return repo_url_from_metadata(tgz_path, metadata)


SOURCE_URL_REGEXES = [
    # These regexes are tried in order. The first group is the extracted URL.
    r"(?i)^Project-URL: Source.*,\s*(.*)$",
    r"(?i)^Home-page: (.*)$",
    r"(?i)^Project-URL: Home.*,\s*(.*)$",
    # If they point to GitHub issues, that's their repo.
    r"(?i)^Project-URL: [^,]+,\s*(https?://github.com/[^/]+/[^/]+)/issues/?$",
    # If we can't find a URL marked as home, then use any GitHub repo URL.
    r"(?i)^Project-URL: [^,]+,\s*(https?://github.com/[^/]+/[^/]+)$",
]
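# For example, a METADATA line such as
#   Project-URL: Source, https://github.com/psf/requests
# would yield "https://github.com/psf/requests" via the first pattern above.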

def repo_url_from_metadata(filename, metadata):
    """Find the likely source repo URL from PyPI metadata."""
    repo_url = matching_text(metadata, SOURCE_URL_REGEXES)
    if repo_url is None:
        print(f"No repo URL in {filename}")
        return None
    if repo_url == "UNKNOWN":
        print(f"Repo URL is UNKNOWN in {filename}")
        return None
    return repo_url

# The first of these we find is the requirements file we'll examine:
PY_REQS = [
    "requirements/edx/base.txt",
    "requirements/base.txt",
    "requirements.txt",
]

# Files that indicate a repo is a Python project:
PY_INDICATORS = [
    "setup.py",
    "setup.cfg",
    "pyproject.toml",
]

def find_py_reqs():
    """Find the Python requirements file to use."""
    for fname in PY_REQS:
        possible_req = Path(fname)
        if possible_req.exists():
            return possible_req
    if any(Path(ind).exists() for ind in PY_INDICATORS):
        print(f"WARNING: {os.getcwd()} is likely a Python package, but we can't find its dependencies.")
    return None


def process_directory():
    """
    Find all the requirements in the current directory. Returns a set of repo URLs.
    Also copies the considered dependencies file into the temp work directory,
    for later analysis.
    """
    repo_name = Path.cwd().name
    repo_work = WORK_DIR / repo_name
    repo_work.mkdir(parents=True, exist_ok=True)
    repo_urls = set()
    if (js_reqs := Path("package-lock.json")).exists():
        shutil.copyfile(js_reqs, repo_work / "package-lock.json")
        with change_dir(repo_work):
            repo_urls.update(check_js_dependencies())
    if (py_reqs := find_py_reqs()):
        shutil.copyfile(py_reqs, repo_work / "base.txt")
        with change_dir(repo_work):
            repo_urls.update(check_py_dependencies())
    return repo_urls
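# The copied manifests land under WORK_DIR/<repo_name>/ (/tmp/unpack_reqs/... by default),
# next to the deps.txt and repo_urls.txt files written by the check_* functions above.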

FIRST_PARTY_ORGS = ["openedx"]

SECOND_PARTY_ORGS = [
    "edx", "edx-unsupported", "edx-solutions",
    "mitodl",
    "overhangio",
    "open-craft", "eduNEXT", "raccoongang",
]

# def urls_in_orgs(urls, orgs):
#     """
#     Find urls that are in any of the `orgs`.
#     """
#     return sorted(
#         url for url in urls
#         if any(f"/{org}/" in url for org in orgs)
#     )

def urls_in_orgs(urls, org):
    """
    Find urls that are in the given GitHub `org`.
    """
    return sorted(
        url for url in urls
        if f"/{org}/" in url
    )


def main(dirs=None, org=None):
    """
    Analyze the requirements of the repo directory given on the command line,
    reporting the dependency repos that belong to the GitHub org given with --org.
    """
    if dirs is None:
        repo_dir = sys.argv[1]
        org_flag_index = sys.argv.index("--org")
        org = sys.argv[org_flag_index + 1]

    print(f"Creating new work directory: {WORK_DIR}")
    shutil.rmtree(WORK_DIR, ignore_errors=True)

    # dirs = sorted(itertools.chain.from_iterable(d.splitlines() for d in dirs))
    repo_urls = set()

    # for i_dir, repo_dir in enumerate(dirs, start=1):
    #     print(f"-- repo {repo_dir} ({i_dir}/{len(dirs)}) ----------")
    with change_dir(repo_dir):
        repo_urls.update(process_directory())

    write_list(WORK_DIR / "repo_urls.txt", sorted(repo_urls))

    # firsts = urls_in_orgs(repo_urls, FIRST_PARTY_ORGS)
    # write_list(WORK_DIR / "first_party_urls.txt", firsts)
    seconds = urls_in_orgs(repo_urls, org)
    # write_list(WORK_DIR / "second_party_urls.txt", seconds)

    print("== DONE ==============")
    print(f"Dependency repos in the {org} org:")
    print("\n".join(seconds))


if __name__ == "__main__":
    main()
