diff --git a/.gitignore b/.gitignore
index 894d169b79..6a00c44d4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@
 # Ignore the build directory
 /build
 
+/repo-docs
+
 # Ignore cache
 /.sass-cache
 /.cache
diff --git a/Gemfile b/Gemfile
index 1e853a52f9..8c7bfcf1a7 100644
--- a/Gemfile
+++ b/Gemfile
@@ -17,6 +17,7 @@
 gem "middleman-search_engine_sitemap"
 gem "git"
 gem "html-pipeline"
 gem "mdl"
+gem "open3"
 gem "govuk_publishing_components"
 gem "govuk_schemas"
diff --git a/Gemfile.lock b/Gemfile.lock
index 87bd5e5630..bbd33b7bb8 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -321,6 +321,7 @@ GEM
     octokit (6.1.1)
       faraday (>= 1, < 3)
       sawyer (~> 0.9)
+    open3 (0.1.2)
     openapi3_parser (0.9.2)
       commonmarker (~> 0.17)
     padrino-helpers (0.15.3)
@@ -497,6 +498,7 @@ DEPENDENCIES
   middleman
   middleman-search_engine_sitemap
   octokit
+  open3
   rake
   rspec
   rubocop-govuk
diff --git a/app/github_repo_fetcher.rb b/app/github_repo_fetcher.rb
index c49d2d7715..69e3e78b58 100644
--- a/app/github_repo_fetcher.rb
+++ b/app/github_repo_fetcher.rb
@@ -1,4 +1,5 @@
 require "octokit"
+require "open3"
 
 class GitHubRepoFetcher
   # The cache is only used locally, as GitHub Pages rebuilds the site
@@ -12,6 +13,7 @@ class GitHubRepoFetcher
   # TODO: we should supply a command line option to automate the
   # cache clearing step above.
   LOCAL_CACHE_DURATION = 1.week
+  REPO_DIR = "#{Bundler.root}/repo-docs/".freeze
 
   def self.instance
     @instance ||= new
@@ -39,11 +41,24 @@ def readme(repo_name)
   def docs(repo_name)
     return nil if repo(repo_name).private_repo?
 
-    CACHE.fetch("alphagov/#{repo_name} docs", expires_in: LOCAL_CACHE_DURATION) do
-      recursively_fetch_files(repo_name, "docs")
-    rescue Octokit::NotFound
-      nil
+    repo_root = File.join(REPO_DIR, repo_name)
+
+    unless File.exist?(repo_root)
+      puts "Cloning #{repo_name} docs"
+      git_commands = [
+        ["git", "clone", "-n", "--depth=1", "--filter=tree:0", "https://github.com/alphagov/#{repo_name}", repo_root],
+        ["git", "sparse-checkout", "set", "--no-cone", "docs", { chdir: repo_root }],
+        ["git", "checkout", { chdir: repo_root }],
+      ]
+      git_commands.each do |command|
+        _stdout_str, stderr_str, status = Open3.capture3(*command)
+        raise "Repo docs clone failed with error: #{stderr_str}" unless status.success?
+      end
     end
+
+    return nil unless File.exist?(File.join(repo_root, "docs"))
+
+    recursively_fetch_files(repo_name, %w[docs])
   end
 
   private
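A note for reviewers on the clone sequence above: `-n` (`--no-checkout`) skips the initial checkout, `--depth=1` truncates history to a single commit, and `--filter=tree:0` makes the clone treeless, so file data is only fetched when the final `git checkout` materialises the sparse `docs` paths. A minimal standalone sketch of the same sequence; the repo name and destination below are placeholders, not part of the change:

```ruby
require "open3"

# Treeless, shallow, sparse clone of a single directory: the same three
# commands GitHubRepoFetcher#docs runs, extracted for trying at a console.
def sparse_docs_clone(repo_name, dest)
  [
    ["git", "clone", "-n", "--depth=1", "--filter=tree:0", "https://github.com/alphagov/#{repo_name}", dest],
    ["git", "sparse-checkout", "set", "--no-cone", "docs", { chdir: dest }],
    ["git", "checkout", { chdir: dest }],
  ].each do |command|
    # A trailing Hash in the argv array is treated as spawn options by capture3
    _stdout, stderr, status = Open3.capture3(*command)
    raise "git failed: #{stderr}" unless status.success?
  end
end

sparse_docs_clone("govuk-frontend", "/tmp/govuk-frontend-docs")
```

This keeps the transfer close to the one commit object plus the blobs under `docs/`, rather than the whole repository.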
@@ -62,28 +77,35 @@ def latest_commit(repo_name, path)
     }
   end
 
-  def recursively_fetch_files(repo_name, path)
-    docs = client.contents("alphagov/#{repo_name}", path:)
-    top_level_files = docs.select { |doc| doc.path.end_with?(".md") }.map do |doc|
-      data_for_github_doc(doc, repo_name)
+  def recursively_fetch_files(repo_name, path_stack)
+    repo_dir = Dir["#{File.join(REPO_DIR, repo_name, *path_stack)}/*"]
+    top_level_files = repo_dir.select { |file_path| File.file?(file_path) && file_path.end_with?(".md") }.map do |doc_path|
+      data_for_github_doc(doc_path, repo_name)
     end
-    docs.select { |doc| doc.type == "dir" }.each_with_object(top_level_files) do |dir, files|
-      files.concat(recursively_fetch_files(repo_name, dir.path))
+    repo_dir.select { |file_path| File.directory?(file_path) }.each_with_object(top_level_files) do |dir, files|
+      files.concat(recursively_fetch_files(repo_name, path_stack + [File.basename(dir)]))
     end
   end
 
-  def data_for_github_doc(doc, repo_name)
-    contents = HTTP.get(doc.download_url)
-    docs_path = doc.path.sub("docs/", "").match(/(.+)\..+$/)[1]
-    filename = docs_path.split("/")[-1]
-    title = ExternalDoc.title(contents) || filename
+  def data_for_github_doc(doc_path, repo_name)
+    contents = File.read(doc_path)
+
+    path = Pathname.new(doc_path)
+    docs_path_with_extension = path.each_filename.to_a.drop_while { |f| f != "docs" }.drop(1).join("/")
+    docs_path_without_extension = docs_path_with_extension.chomp(".md")
+    relative = path.each_filename.to_a.drop_while { |f| f != "docs" }.join("/")
+
+    basename_without_extension = File.basename(doc_path, ".md")
+    title = ExternalDoc.title(contents) || basename_without_extension
+
+    url = "https://github.com/alphagov/#{repo_name}/blob/main/#{relative}"
 
     {
-      path: "/repos/#{repo_name}/#{docs_path}.html",
+      path: "/repos/#{repo_name}/#{docs_path_without_extension}.html",
       title: title.to_s.force_encoding("UTF-8"),
       markdown: contents.to_s.force_encoding("UTF-8"),
-      relative_path: doc.path,
-      source_url: doc.html_url,
-      latest_commit: latest_commit(repo_name, doc.path),
+      relative_path: relative,
+      source_url: url,
+      latest_commit: latest_commit(repo_name, relative),
     }
   end
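(Note: the recursion now passes `path_stack + [File.basename(dir)]` rather than `path_stack << File.basename(dir)`, since `<<` mutates the shared array and would corrupt sibling directory paths on the second iteration.)

The `Pathname` manipulation in `data_for_github_doc` is easier to follow with a worked example. Assuming a hypothetical checkout under `REPO_DIR`; the absolute prefix below is illustrative only:

```ruby
require "pathname"

doc_path = "/srv/app/repo-docs/some-repo/docs/foo/bar.md"

segments = Pathname.new(doc_path).each_filename.to_a
# => ["srv", "app", "repo-docs", "some-repo", "docs", "foo", "bar.md"]
# drop_while compares whole segments, so "repo-docs" does not stop the scan.

relative = segments.drop_while { |f| f != "docs" }.join("/")
# => "docs/foo/bar.md"  (used for relative_path, source_url and latest_commit)

docs_path = segments.drop_while { |f| f != "docs" }.drop(1).join("/").chomp(".md")
# => "foo/bar"          (used for the rendered page path)

puts "/repos/some-repo/#{docs_path}.html"
# => /repos/some-repo/foo/bar.html
```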
diff --git a/spec/app/github_repo_fetcher_spec.rb b/spec/app/github_repo_fetcher_spec.rb
index 425f119502..327d199be9 100644
--- a/spec/app/github_repo_fetcher_spec.rb
+++ b/spec/app/github_repo_fetcher_spec.rb
@@ -1,5 +1,6 @@
 RSpec.describe GitHubRepoFetcher do
   before :each do
+    stub_const("GitHubRepoFetcher::REPO_DIR", "#{Bundler.root}/spec/fixtures/repo-docs/".freeze)
     stub_request(:get, "https://api.github.com/users/alphagov/repos?per_page=100")
       .to_return(
         body: "[ { \"name\": \"some-repo\", \"default_branch\": \"main\" } ]",
@@ -116,13 +117,9 @@ def readme_url
   end
 
   describe "#docs" do
-    let(:repo_name) { SecureRandom.uuid }
+    let(:repo_name) { "some-repo" }
     let(:commit) { { sha: SecureRandom.hex(40), timestamp: Time.now.utc.to_s } }
 
-    def docs_url(repo_name)
-      "https://api.github.com/repos/alphagov/#{repo_name}/contents/docs"
-    end
-
     def github_repo_fetcher_returning(repo)
       instance = GitHubRepoFetcher.new
       allow(instance).to receive(:repo).with(repo_name).and_return(repo)
@@ -133,96 +130,46 @@ def github_repo_fetcher_returning(repo)
     end
 
     context "the repo contains a reachable docs/ folder" do
       let(:expected_hash_structure) { hash_including(:title, :markdown, :path, :relative_path, :source_url, :latest_commit) }
 
-      def with_stubbed_client(temporary_client, instance)
-        before_client = instance.instance_variable_get(:@client)
-        instance.instance_variable_set(:@client, temporary_client)
-        yield
-        instance.instance_variable_set(:@client, before_client)
-      end
-
-      def stub_doc(contents: "arbitrary contents", path: "docs/foo.md")
-        doc = double("doc", type: "file", download_url: "foo_url", path:, html_url: "foo_html_url")
-        allow(HTTP).to receive(:get).with(doc.download_url).and_return(contents)
-        doc
-      end
-
-      it "returns an array of hashes" do
-        instance = github_repo_fetcher_returning(public_repo)
-
-        with_stubbed_client(double("Octokit::Client", contents: [stub_doc]), instance) do
-          expect(instance.docs(repo_name)).to match([expected_hash_structure])
-        end
-      end
-
       it "derives each document title from its markdown" do
         instance = github_repo_fetcher_returning(public_repo)
-        doc = stub_doc(contents: "# title \n Some document")
-
-        with_stubbed_client(double("Octokit::Client", contents: [doc]), instance) do
-          doc = instance.docs(repo_name).first
-          expect(doc[:title]).to eq("title")
-        end
+        allow(File).to receive(:read).and_return("# title \n Some document")
+        expect(instance.docs(repo_name).first[:title]).to eq("title")
       end
 
       it "derives document title from its filename if not present in markdown" do
        instance = github_repo_fetcher_returning(public_repo)
-        doc = stub_doc(contents: "bar \n Some document")
-
-        with_stubbed_client(double("Octokit::Client", contents: [doc]), instance) do
-          doc = instance.docs(repo_name).first
-          expect(doc[:title]).to eq("foo")
-        end
+        allow(File).to receive(:read).and_return("bar \n Some document")
+        expect(instance.docs(repo_name).first[:title]).to eq("foo")
       end
 
       it "maintains the original directory structure" do
         instance = github_repo_fetcher_returning(public_repo)
-        doc = stub_doc(path: "docs/subdir/foo.md")
-
-        with_stubbed_client(double("Octokit::Client", contents: [doc]), instance) do
-          doc = instance.docs(repo_name).first
-          expect(doc[:path]).to eq("/repos/#{repo_name}/subdir/foo.html")
-          expect(doc[:relative_path]).to eq("docs/subdir/foo.md")
-        end
+        doc = instance.docs(repo_name).second
+        expect(doc[:path]).to eq("/repos/#{repo_name}/foo/bar.html")
+        expect(doc[:relative_path]).to eq("docs/foo/bar.md")
       end
 
       it "retrieves documents recursively" do
-        dir = double("dir", type: "dir", path: "docs/foo")
-        nested_doc = stub_doc(path: "docs/foo/bar.md")
-
         instance = github_repo_fetcher_returning(public_repo)
-        allow(HTTP).to receive(:get).with(nested_doc.download_url).and_return("some contents")
-
-        stubbed_client = double("Octokit::Client")
-        allow(stubbed_client).to receive(:contents).with("alphagov/#{repo_name}", path: "docs")
-          .and_return([dir])
-        allow(stubbed_client).to receive(:contents).with("alphagov/#{repo_name}", path: "docs/foo")
-          .and_return([nested_doc])
-
-        with_stubbed_client(stubbed_client, instance) do
-          expect(instance.docs(repo_name)).to match([expected_hash_structure])
-        end
+        expect(instance.docs(repo_name)).to include(expected_hash_structure)
       end
 
       it "skips over any non-markdown files" do
         instance = github_repo_fetcher_returning(public_repo)
-        non_markdown_file = double("non markdown file", type: "file", path: "docs/digests.png")
-
-        with_stubbed_client(double("Octokit::Client", contents: [non_markdown_file]), instance) do
-          expect(instance.docs(repo_name)).to eq([])
-        end
+        allow(Dir).to receive(:[]).and_return(["#{Bundler.root}/spec/fixtures/repo-docs/some-repo/docs/digests.png"])
+        expect(instance.docs(repo_name)).to eq([])
      end
     end
 
     it "returns nil if no docs folder exists" do
       instance = github_repo_fetcher_returning(public_repo)
-      stub_request(:get, docs_url(repo_name))
-        .to_return(status: 404, body: "{}", headers: { content_type: "application/json" })
-
-      expect(instance.docs(repo_name)).to be_nil
+      allow(File).to receive(:exist?).and_call_original
+      allow(File).to receive(:exist?).with("#{Bundler.root}/spec/fixtures/repo-docs/some-repo/docs").and_return(false)
+      expect(instance.docs(repo_name)).to eq(nil)
     end
 
     it "returns nil if the repo is private" do
       instance = github_repo_fetcher_returning(private_repo)
-
       expect(instance.docs(repo_name)).to eq(nil)
     end
   end
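With `stub_const` pointing `REPO_DIR` at `spec/fixtures/repo-docs/`, the `#docs` examples now read real files from the fixture tree instead of stubbing an Octokit client, so no git or HTTP calls are needed. For orientation, the entry `#docs` produces for the nested fixture looks roughly like this (a sketch with values abridged; `latest_commit` comes from the stubbed commit data):

```ruby
{
  path: "/repos/some-repo/foo/bar.html",
  title: "Lorem ipsum",
  markdown: "# Lorem ipsum\n\n## tl;dr\n...",
  relative_path: "docs/foo/bar.md",
  source_url: "https://github.com/alphagov/some-repo/blob/main/docs/foo/bar.md",
  latest_commit: { sha: "...", timestamp: "..." },
}
```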
folder exists" do instance = github_repo_fetcher_returning(public_repo) - stub_request(:get, docs_url(repo_name)) - .to_return(status: 404, body: "{}", headers: { content_type: "application/json" }) - - expect(instance.docs(repo_name)).to be_nil + allow(File).to receive(:exist?).and_call_original + allow(File).to receive(:exist?).with("#{Bundler.root}/spec/fixtures/repo-docs/some-repo/docs").and_return(false) + expect(instance.docs(repo_name)).to eq(nil) end it "returns nil if the repo is private" do instance = github_repo_fetcher_returning(private_repo) - expect(instance.docs(repo_name)).to eq(nil) end end diff --git a/spec/fixtures/repo-docs/some-repo/docs/foo.md b/spec/fixtures/repo-docs/some-repo/docs/foo.md new file mode 100644 index 0000000000..e074f2fb7b --- /dev/null +++ b/spec/fixtures/repo-docs/some-repo/docs/foo.md @@ -0,0 +1,32 @@ +# Lorem ipsum + +## tl;dr + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam efficitur lacus in +neque consequat porta, [Suspendisse iaculis](#suspendisse-iaculis). Sed et +[inline link](./inline-link.md), imperdiet eros ut, [aliased link]. +[Absolute link](https://nam.com/eget/dui/absolute-link.md) + +[localhost](localhost:999) + +[Relative docs link with period](./docs/prefixed.md) + +[Relative docs link without period](docs/no-prefix.md) + +[Link relative to root](/public/json_examples/requests/foo.json) + +[Subfolder](docs/some-subfolder/foo.md) + +Visit “http://localhost:3108” + +## Suspendisse iaculis + +![Suspendisse iaculis](suspendisse_iaculis.png) + +[aliased link]: ./lib/aliased_link.rb + +## Other headings to test + +### data.gov.uk + +### Patterns & Style Guides diff --git a/spec/fixtures/repo-docs/some-repo/docs/foo/bar.md b/spec/fixtures/repo-docs/some-repo/docs/foo/bar.md new file mode 100644 index 0000000000..e074f2fb7b --- /dev/null +++ b/spec/fixtures/repo-docs/some-repo/docs/foo/bar.md @@ -0,0 +1,32 @@ +# Lorem ipsum + +## tl;dr + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam efficitur lacus in +neque consequat porta, [Suspendisse iaculis](#suspendisse-iaculis). Sed et +[inline link](./inline-link.md), imperdiet eros ut, [aliased link]. +[Absolute link](https://nam.com/eget/dui/absolute-link.md) + +[localhost](localhost:999) + +[Relative docs link with period](./docs/prefixed.md) + +[Relative docs link without period](docs/no-prefix.md) + +[Link relative to root](/public/json_examples/requests/foo.json) + +[Subfolder](docs/some-subfolder/foo.md) + +Visit “http://localhost:3108” + +## Suspendisse iaculis + +![Suspendisse iaculis](suspendisse_iaculis.png) + +[aliased link]: ./lib/aliased_link.rb + +## Other headings to test + +### data.gov.uk + +### Patterns & Style Guides