From 8901aa270d4a566fe336fb13bff9703db2faa0fc Mon Sep 17 00:00:00 2001 From: Phil Rzewski Date: Wed, 27 Nov 2024 08:26:17 -0800 Subject: [PATCH] Scripts and AWS results for perf section of super command doc (#5506) --- .markdownlint.yaml | 2 +- docs/commands/search.sql | 487 --------- docs/commands/super.md | 954 +++++++++++------- scripts/super-cmd-perf/README.md | 106 ++ scripts/super-cmd-perf/benchmark.sh | 97 ++ scripts/super-cmd-perf/prep-data.sh | 58 ++ scripts/super-cmd-perf/queries/agg.sql | 4 + scripts/super-cmd-perf/queries/count.sql | 3 + scripts/super-cmd-perf/queries/search+.spq | 3 + scripts/super-cmd-perf/queries/search+.sql | 489 +++++++++ scripts/super-cmd-perf/queries/search.spq | 3 + scripts/super-cmd-perf/queries/search.sql | 3 + .../queries/union-clickhouse.sql | 13 + .../queries/union-datafusion.sql | 15 + scripts/super-cmd-perf/queries/union.spq | 6 + scripts/super-cmd-perf/queries/union.sql | 13 + scripts/super-cmd-perf/run-queries.sh | 146 +++ 17 files changed, 1562 insertions(+), 840 deletions(-) delete mode 100644 docs/commands/search.sql create mode 100644 scripts/super-cmd-perf/README.md create mode 100755 scripts/super-cmd-perf/benchmark.sh create mode 100755 scripts/super-cmd-perf/prep-data.sh create mode 100644 scripts/super-cmd-perf/queries/agg.sql create mode 100644 scripts/super-cmd-perf/queries/count.sql create mode 100644 scripts/super-cmd-perf/queries/search+.spq create mode 100644 scripts/super-cmd-perf/queries/search+.sql create mode 100644 scripts/super-cmd-perf/queries/search.spq create mode 100644 scripts/super-cmd-perf/queries/search.sql create mode 100644 scripts/super-cmd-perf/queries/union-clickhouse.sql create mode 100644 scripts/super-cmd-perf/queries/union-datafusion.sql create mode 100644 scripts/super-cmd-perf/queries/union.spq create mode 100644 scripts/super-cmd-perf/queries/union.sql create mode 100755 scripts/super-cmd-perf/run-queries.sh diff --git a/.markdownlint.yaml b/.markdownlint.yaml index 
3c9b4c92a3..1bdcd8fce8 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -11,4 +11,4 @@ whitespace: true MD010: code_blocks: false # Disallow hard tabs except in code blocks. MD033: - allowed_elements: ["p"] + allowed_elements: ["p","br"] diff --git a/docs/commands/search.sql b/docs/commands/search.sql deleted file mode 100644 index f7465cfa80..0000000000 --- a/docs/commands/search.sql +++ /dev/null @@ -1,487 +0,0 @@ -SELECT count() FROM gha WHERE - id LIKE '%in case you have any feedback 😊%' -OR type LIKE '%in case you have any feedback 😊%' -OR actor.login LIKE '%in case you have any feedback 😊%' -OR actor.display_login LIKE '%in case you have any feedback 😊%' -OR actor.gravatar_id LIKE '%in case you have any feedback 😊%' -OR actor.url LIKE '%in case you have any feedback 😊%' -OR actor.avatar_url LIKE '%in case you have any feedback 😊%' -OR repo.name LIKE '%in case you have any feedback 😊%' -OR repo.url LIKE '%in case you have any feedback 😊%' -OR payload.ref LIKE '%in case you have any feedback 😊%' -OR payload.ref_type LIKE '%in case you have any feedback 😊%' -OR payload.pusher_type LIKE '%in case you have any feedback 😊%' -OR payload.head LIKE '%in case you have any feedback 😊%' -OR payload.before LIKE '%in case you have any feedback 😊%' -OR payload.master_branch LIKE '%in case you have any feedback 😊%' -OR payload.description LIKE '%in case you have any feedback 😊%' -OR payload.action LIKE '%in case you have any feedback 😊%' -OR org.login LIKE '%in case you have any feedback 😊%' -OR org.gravatar_id LIKE '%in case you have any feedback 😊%' -OR org.url LIKE '%in case you have any feedback 😊%' -OR org.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.review.node_id LIKE '%in case you have any feedback 😊%' -OR payload.review.user.login LIKE '%in case you have any feedback 😊%' -OR payload.review.user.node_id LIKE '%in case you have any feedback 😊%' -OR payload.review.user.avatar_url LIKE '%in case you have any feedback 😊%' -OR 
payload.review.user.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.review.user.url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.html_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.following_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.events_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.review.user.type LIKE '%in case you have any feedback 😊%' -OR payload.review.body LIKE '%in case you have any feedback 😊%' -OR payload.review.commit_id LIKE '%in case you have any feedback 😊%' -OR payload.review.state LIKE '%in case you have any feedback 😊%' -OR payload.review.html_url LIKE '%in case you have any feedback 😊%' -OR payload.review.pull_request_url LIKE '%in case you have any feedback 😊%' -OR payload.review.author_association LIKE '%in case you have any feedback 😊%' -OR payload.review._links.html.href LIKE '%in case you have any feedback 😊%' -OR payload.review._links.pull_request.href LIKE '%in case you have any feedback 😊%' -OR payload.comment.url LIKE '%in case you have any feedback 😊%' -OR payload.comment.html_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.node_id LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.login LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.node_id LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.avatar_url LIKE '%in case you have any 
feedback 😊%' -OR payload.comment.user.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.html_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.following_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.events_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.user.type LIKE '%in case you have any feedback 😊%' -OR payload.comment.path LIKE '%in case you have any feedback 😊%' -OR payload.comment.commit_id LIKE '%in case you have any feedback 😊%' -OR payload.comment.author_association LIKE '%in case you have any feedback 😊%' -OR payload.comment.body LIKE '%in case you have any feedback 😊%' -OR payload.comment.reactions.url LIKE '%in case you have any feedback 😊%' -OR payload.comment.issue_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.diff_hunk LIKE '%in case you have any feedback 😊%' -OR payload.comment.original_commit_id LIKE '%in case you have any feedback 😊%' -OR payload.comment.pull_request_url LIKE '%in case you have any feedback 😊%' -OR payload.comment.start_side LIKE '%in case you have any feedback 😊%' -OR payload.comment.side LIKE '%in case you have any feedback 😊%' -OR payload.issue.url LIKE '%in case you have any feedback 😊%' -OR payload.issue.repository_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.labels_url LIKE '%in 
case you have any feedback 😊%' -OR payload.issue.comments_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.events_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.html_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.node_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.title LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.login LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.node_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.html_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.following_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.events_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.user.type LIKE '%in case you have any feedback 😊%' -OR payload.issue.state LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.login LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.node_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.url LIKE '%in case 
you have any feedback 😊%' -OR payload.issue.assignee.html_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.following_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.events_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.assignee.type LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.html_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.labels_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.node_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.title LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.description LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.login LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.node_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.html_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.followers_url LIKE '%in case you have any feedback 😊%' -OR 
payload.issue.milestone.creator.following_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.events_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.creator.type LIKE '%in case you have any feedback 😊%' -OR payload.issue.milestone.state LIKE '%in case you have any feedback 😊%' -OR payload.issue.author_association LIKE '%in case you have any feedback 😊%' -OR payload.issue.active_lock_reason LIKE '%in case you have any feedback 😊%' -OR payload.issue.body LIKE '%in case you have any feedback 😊%' -OR payload.issue.reactions.url LIKE '%in case you have any feedback 😊%' -OR payload.issue.timeline_url LIKE '%in case you have any feedback 😊%' -OR payload.issue.state_reason LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.diff_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.patch_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.issue_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.state LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.title LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.login LIKE '%in case you have any feedback 😊%' -OR 
payload.pull_request.user.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.following_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.user.type LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.body LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merge_commit_sha LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.commits_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.review_comments_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.review_comment_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.comments_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.statuses_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.label LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.ref LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.sha LIKE '%in case 
you have any feedback 😊%' -OR payload.pull_request.head.user.login LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.following_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.user.type LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.name LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.full_name LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.login LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.gravatar_id 
LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.following_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.owner.type LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.description LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.forks_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.keys_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.collaborators_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.teams_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.hooks_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.issue_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.events_url 
LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.assignees_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.branches_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.tags_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.blobs_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.git_tags_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.git_refs_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.trees_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.statuses_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.languages_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.stargazers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.contributors_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.subscribers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.subscription_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.commits_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.git_commits_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.comments_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.issue_comment_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.contents_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.compare_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.merges_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.archive_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.downloads_url LIKE '%in case you have any 
feedback 😊%' -OR payload.pull_request.head.repo.issues_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.pulls_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.milestones_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.notifications_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.labels_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.releases_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.deployments_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.git_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.ssh_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.clone_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.svn_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.homepage LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.language LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.mirror_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.visibility LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.head.repo.default_branch LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.label LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.ref LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.sha LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.login LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.gravatar_id LIKE '%in case you have any feedback 😊%' -OR 
payload.pull_request.base.user.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.following_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.user.type LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.name LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.full_name LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.login LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.following_url 
LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.owner.type LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.description LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.forks_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.keys_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.collaborators_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.teams_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.hooks_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.issue_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.assignees_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.branches_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.tags_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.blobs_url LIKE '%in case you 
have any feedback 😊%' -OR payload.pull_request.base.repo.git_tags_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.git_refs_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.trees_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.statuses_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.languages_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.stargazers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.contributors_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.subscribers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.subscription_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.commits_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.git_commits_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.comments_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.issue_comment_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.contents_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.compare_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.merges_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.archive_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.downloads_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.issues_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.pulls_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.milestones_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.notifications_url LIKE '%in case you have any feedback 😊%' 
-OR payload.pull_request.base.repo.labels_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.releases_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.deployments_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.git_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.ssh_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.clone_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.svn_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.homepage LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.language LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.mirror_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.visibility LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.base.repo.default_branch LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.self.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.html.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.issue.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.comments.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.review_comments.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.review_comment.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.commits.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request._links.statuses.href LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.author_association LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.active_lock_reason LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.mergeable_state LIKE '%in case you have any feedback 😊%' -OR 
payload.pull_request.merged_by.login LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.node_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.html_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.following_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.pull_request.merged_by.type LIKE '%in case you have any feedback 😊%' -OR payload.forkee.node_id LIKE '%in case you have any feedback 😊%' -OR payload.forkee.name LIKE '%in case you have any feedback 😊%' -OR payload.forkee.full_name LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.login LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.node_id LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.url LIKE '%in case you have any feedback 😊%' -OR 
payload.forkee.owner.html_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.following_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.events_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.owner.type LIKE '%in case you have any feedback 😊%' -OR payload.forkee.html_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.description LIKE '%in case you have any feedback 😊%' -OR payload.forkee.url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.forks_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.keys_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.collaborators_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.teams_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.hooks_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.issue_events_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.events_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.assignees_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.branches_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.tags_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.blobs_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.git_tags_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.git_refs_url LIKE '%in case you have any 
feedback 😊%' -OR payload.forkee.trees_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.statuses_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.languages_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.stargazers_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.contributors_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.subscribers_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.subscription_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.commits_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.git_commits_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.comments_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.issue_comment_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.contents_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.compare_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.merges_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.archive_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.downloads_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.issues_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.pulls_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.milestones_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.notifications_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.labels_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.releases_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.deployments_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.git_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.ssh_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.clone_url LIKE '%in case you have any feedback 😊%' -OR payload.forkee.svn_url LIKE '%in case you have any feedback 😊%' -OR 
payload.forkee.homepage LIKE '%in case you have any feedback 😊%' -OR payload.forkee.visibility LIKE '%in case you have any feedback 😊%' -OR payload.forkee.default_branch LIKE '%in case you have any feedback 😊%' -OR payload.release.url LIKE '%in case you have any feedback 😊%' -OR payload.release.assets_url LIKE '%in case you have any feedback 😊%' -OR payload.release.upload_url LIKE '%in case you have any feedback 😊%' -OR payload.release.html_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.login LIKE '%in case you have any feedback 😊%' -OR payload.release.author.node_id LIKE '%in case you have any feedback 😊%' -OR payload.release.author.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.release.author.url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.html_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.following_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.events_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.release.author.type LIKE '%in case you have any feedback 😊%' -OR payload.release.node_id LIKE '%in case you have any feedback 😊%' -OR payload.release.tag_name LIKE '%in case you have any feedback 😊%' -OR payload.release.target_commitish LIKE '%in case you have any feedback 😊%' -OR 
payload.release.name LIKE '%in case you have any feedback 😊%' -OR payload.release.tarball_url LIKE '%in case you have any feedback 😊%' -OR payload.release.zipball_url LIKE '%in case you have any feedback 😊%' -OR payload.release.body LIKE '%in case you have any feedback 😊%' -OR payload.release.short_description_html LIKE '%in case you have any feedback 😊%' -OR payload.release.discussion_url LIKE '%in case you have any feedback 😊%' -OR payload.member.login LIKE '%in case you have any feedback 😊%' -OR payload.member.node_id LIKE '%in case you have any feedback 😊%' -OR payload.member.avatar_url LIKE '%in case you have any feedback 😊%' -OR payload.member.gravatar_id LIKE '%in case you have any feedback 😊%' -OR payload.member.url LIKE '%in case you have any feedback 😊%' -OR payload.member.html_url LIKE '%in case you have any feedback 😊%' -OR payload.member.followers_url LIKE '%in case you have any feedback 😊%' -OR payload.member.following_url LIKE '%in case you have any feedback 😊%' -OR payload.member.gists_url LIKE '%in case you have any feedback 😊%' -OR payload.member.starred_url LIKE '%in case you have any feedback 😊%' -OR payload.member.subscriptions_url LIKE '%in case you have any feedback 😊%' -OR payload.member.organizations_url LIKE '%in case you have any feedback 😊%' -OR payload.member.repos_url LIKE '%in case you have any feedback 😊%' -OR payload.member.events_url LIKE '%in case you have any feedback 😊%' -OR payload.member.received_events_url LIKE '%in case you have any feedback 😊%' -OR payload.member.type LIKE '%in case you have any feedback 😊%' diff --git a/docs/commands/super.md b/docs/commands/super.md index 31b960b1e0..e7c72df485 100644 --- a/docs/commands/super.md +++ b/docs/commands/super.md @@ -663,20 +663,34 @@ the `super` command, but it turns out that `super` can hold its own when compared to other analytics systems. 
To illustrate comparative performance, we'll present some informal performance -measurements among `super`, -[`DuckDB`](https://duckdb.org/), -[`ClickHouse`](https://clickhouse.com/), and -[`DataFusion`](https://datafusion.apache.org/). +measurements among SuperDB, +[DuckDB](https://duckdb.org/), +[ClickHouse](https://clickhouse.com/), and +[DataFusion](https://datafusion.apache.org/). We'll use the Parquet format to compare apples to apples and also report results for the custom columnar database format of DuckDB -and the Super Binary format used by `super`. -We tried loading our test data into a ClickHouse table using its -[new experimental JSON type](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse) -but those attempts failed with "too many open files". +and the [Super Binary](../formats/bsup.md) format used by `super`. -As of this writing in November 2024, we're using the latest version 1.1.3 of `duckdb`. -version 24.11.1.1393 of `clickhouse`, and v43.0.0 of `datafusion-cli`. +We also experimented with loading our test data into a ClickHouse table using its +[new beta JSON type](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse). +Preliminary results showed a mix of good performance and failed queries. +We'll provide more detail on these tests soon. + +The detailed steps shown [below](#appendix-2-running-the-tests) can be reproduced via +[automated scripts](https://github.com/brimdata/super/blob/main/scripts/super-cmd-perf). 
+As of this writing in November 2024, [results](#the-test-results) were gathered on an AWS +[`m6idn.2xlarge`](https://aws.amazon.com/ec2/instance-types/m6i/) instance +with the following software versions: + +|**Software**|**Version**| +|-|-| +|`super`|Commit `31760cd`| +|`duckdb`|`v1.1.3` 19864453f7| +|`datafusion-cli`|datafusion-cli `43.0.0`| +|`clickhouse`|ClickHouse local version `24.10.3.21` (official build)| + +The complete run logs are [archived here](https://super-cmd-perf.s3.us-east-2.amazonaws.com/2024-11-26_03-17-25.tgz). ### The Test Data @@ -694,7 +708,7 @@ wget https://data.gharchive.org/2023-02-08-1.json.gz wget https://data.gharchive.org/2023-02-08-23.json.gz ``` We downloaded these files into a directory called `gharchive_gz` -and created a duckdb database file called `gha.db` and a table called `gha` +and created a DuckDB database file called `gha.db` and a table called `gha` using this command: ``` duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', union_by_name=true)" @@ -708,15 +722,15 @@ We then created a Parquet file called `gha.parquet` with this command: duckdb gha.db -c "COPY (from gha) TO 'gha.parquet'" ``` To create a super-structed file for the `super` command, there is no need to -fuse the data into a single schema (though `super` can still work with the fused +[`fuse`](../language/operators/fuse.md) the data into a single schema (though `super` can still work with the fused schema in the Parquet file), and we simply ran this command to create a Super Binary file: ``` super gharchive_gz/*.json.gz > gha.bsup ``` This code path in `super` is not multi-threaded so not particularly performant but, -on our test machine, it takes about the same time as the `duckdb` method of creating -a schema-fused table. +on our test machine, this runs more than 2x faster than the `duckdb` method of +creating a schema-fused table. 
Here are the resulting file sizes: ``` @@ -735,11 +749,11 @@ The test queries involve these patterns: * count by field aggregation * rank over union of disparate field types -We will call these tests `search`, `search+`, `count`, `agg`, and `union`, respectively +We will call these tests [search](#search), [search+](#search-1), [count](#count), [agg](#agg), and [union](#union), respectively #### Search -For the search test, we'll search for the string pattern +For the _search_ test, we'll search for the string pattern ``` "in case you have any feedback 😊" ``` @@ -754,9 +768,10 @@ SELECT count() FROM 'gha.parquet' -- or gha WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' ``` -SuperSQL has a function called `grep` that is similar to the SQL `LIKE` clause but -can operate over specified fields or default to all the string fields in any value. -The SuperSQL query is +SuperSQL supports `LIKE` and could run this plain SQL query, but it also has a +similar function called [`grep`](../language/functions/grep.md) that can operate over specified fields or +default to all the string fields in any value. The SuperSQL query that uses +`grep` is ```sql SELECT count() FROM 'gha.bsup' @@ -778,14 +793,21 @@ WHERE id LIKE '%in case you have any feedback 😊%' OR payload.member.type LIKE '%in case you have any feedback 😊%' ``` There are 486 such fields. You can review the entire query in -[docs/commands/search.sql](search.sql). +[`search+.sql`](https://github.com/brimdata/super/blob/main/scripts/super-cmd-perf/queries/search%2B.sql). + +In SuperSQL, `grep` allows for a much shorter query. +```sql +SELECT count() +FROM 'gha.bsup' +WHERE grep('in case you have any feedback 😊') +``` #### Count -In the `count` test, we filter the input with a WHERE clause and count the results. +In the _count_ test, we filter the input with a `WHERE` clause and count the results. We chose a random GitHub user name for the filter. 
This query has the form: -``` +```sql SELECT count() FROM 'gha.parquet' -- or gha or 'gha.bsup' WHERE actor.login='johnbieren'" @@ -793,10 +815,10 @@ WHERE actor.login='johnbieren'" #### Agg -In the `agg` test, we filter the input and count the results grouped by the field `type` +In the _agg_ test, we filter the input and count the results grouped by the field `type` as in the DuckDB blog. This query has the form: -``` +```sql SELECT count(),type FROM 'gha.parquet' -- or 'gha' or 'gha.bsup' WHERE repo.name='duckdb/duckdb' @@ -805,7 +827,7 @@ GROUP BY type #### Union -The `union` test is straight out of the DuckDB blog at the end of +The _union_ test is straight out of the DuckDB blog at the end of [this section](https://duckdb.org/2023/03/03/json.html#handling-inconsistent-json-schemas). This query computes the GitHub users that were assigned as a PR reviewer the most often and returns the top 5 such users. @@ -818,10 +840,10 @@ This query is: ```sql WITH assignees AS ( SELECT payload.pull_request.assignee.login assignee - FROM 'gha.parquet' + FROM 'gha.parquet' -- or 'gha' UNION ALL SELECT unnest(payload.pull_request.assignees).login assignee - FROM 'gha.parquet' + FROM 'gha.parquet' -- or 'gha' ) SELECT assignee, count(*) count FROM assignees @@ -837,19 +859,19 @@ FROM 'gha.parquet' ``` as ```sql -SELECT rec.login as assignee FROM ( - SELECT unnest(payload.pull_request.assignees) rec +SELECT object.login as assignee FROM ( + SELECT unnest(payload.pull_request.assignees) object FROM 'gha.parquet' ) ``` and for ClickHouse, we had to use `arrayJoin` instead of `unnest`. -SuperSQL's data model does not require these sorts of gymnastics as +SuperSQL's data model does not require these kinds of gymnastics as everything does not have to be jammed into a table. 
Instead, we can use the -`UNNEST` pipe operator combined with the spread operator applied to the array of +`UNNEST` pipe operator combined with the [spread operator](../language/expressions.md#array-expressions) applied to the array of string fields to easily produce a stream of string values representing the assignees. Then we simply aggregate the assignee stream: -``` +```sql FROM 'gha.bsup' | UNNEST [...payload.pull_request.assignees, payload.pull_request.assignee] | WHERE this IS NOT NULL @@ -860,30 +882,32 @@ FROM 'gha.bsup' ### The Test Results -The following table summarizes the results of each test as a column and -each tool as a row with the speed-up factor shown in parentheses: +The following table summarizes the query performance for each tool as recorded in the +[most recent archived run](https://super-cmd-perf.s3.us-east-2.amazonaws.com/2024-11-26_03-17-25.tgz). +The run time for each query in seconds is shown along with the speed-up factor +in parentheses: -| tool | format | search | search+ | count | agg | union | -|--------------|---------------|---------------|---------------|----|------|-------| -| `super` | `bsup` | 3.2 (2.6X) | 6.7 (3.6X) | 3.2 (0.04X) | 3.1 (0.04X) | 3.8 (117X) | -| `super` | `parquet` | note 1 | note 1 | 0.18 (0.7X) | 0.27 (0.4X) | note 2 | -| `duckdb` | `db` | 8.2 | 24 | 0.13 | 0.12 | 446 | -| `duckdb` | `parquet` | 8.4 (1) | 23 (1X) | 0.26 (0.5X) | 0.21 (0.6X) | 419 (1.1X) | -| `datafusion` | `parquet` | 9.1 (0.9X) | 18 (1.3X) | 0.24 (0.5X) | 0.24 (0.5X) | 40 (11x) | -| `clickhouse` | `parquet` | 56 (0.1X) | 463 (0.1X) | 1 (0.1X) | 0.91 (0.1X) | 66 (7X) | +|**Tool**|**Format**|**search**|**search+**|**count**|**agg**|**union**| +|-|-|-|-|-|-|-| +|`super`|`bsup`|6.3
(1.9x)|14.3
(1.4x)|5.7
(0.03x)|5.6
(0.03x)|8.2
(63x)| +|`super`|`parquet`|note 1|note 1|0.3
(0.6x)|0.5
(0.3x)|note 2| +|`duckdb`|`db`|12.1|19.8|0.2|0.14|519| +|`duckdb`|`parquet`|12.9
(0.9x)|21.2
(0.9x)|0.4
(0.4x)|0.3
(0.5x)|499
(1x)| +|`datafusion`|`parquet`|11.1
(1.1x)|21.1
(0.9x)|0.4
(0.5x)|0.4
(0.4x)|24.3
(21x)| +|`clickhouse`|`parquet`|68
(0.2x)|845
(0.02x)|1
(0.2x)|0.9
(0.2x)|70
(7x)| _Note 1: the `super` vectorized runtime does not yet support `grep`_ _Note 2: the `super` vectorized runtime does not yet support array expressions_ Since DuckDB with its native format is overall the best performing, -we used it as the baseline for all of the speedup factors. +we used it as the baseline for all of the speed-up factors. To summarize, `super` with Super Binary is substantially faster than the relational systems for the search use cases and performs on par with the others for traditional OLAP queries, -except for the union query, where the super-structured data model trounces the relational -model (by over 100X!) for stitching together disparate data types for analysis in an aggregation. +except for the _union_ query, where the super-structured data model trounces the relational +model (by over 60x!) for stitching together disparate data types for analysis in an aggregation. ## Appendix 1: Preparing the Test Data @@ -927,16 +951,16 @@ FROM ( Hmm, now `duckdb` runs out of memory. We then thought we'd see if the sampling algorithm of `read_json` is more efficient, -so we ran tried this command with successively larger sample sizes: +so we tried this command with successively larger sample sizes: ``` duckdb scratch -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', sample_size=1000000)" ``` -even with a million rows as the sample, `duckdb` fails with +Even with a million rows as the sample, `duckdb` fails with ``` Invalid Input Error: JSON transform error in file "gharchive_gz/2023-02-08-14.json.gz", in line 49745: Object {"issues":"write","metadata":"read","pull_requests... has unknown key "repository_hooks" Try increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or 'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when reading multiple files with a different structure. 
``` -Ok, there 4434953 JSON objects in the input so let's try this: +Ok, there are 4,434,953 JSON objects in the input so let's try this: ``` duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', sample_size=4434953)" ``` @@ -951,344 +975,595 @@ Sure enough, this works: ``` duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', union_by_name=true)" ``` -We now have the `duckdb` database file for our GitHub Archive data called `gha.db` +We now have the DuckDB database file for our GitHub Archive data called `gha.db` containing a single table called `gha` embedded in that database. What about the super-structured format for the `super` command? There is no need to futz with sample sizes, -schema inference, or union by name, just run this to create a Super Binary file: +schema inference, or union by name. Just run this to create a Super Binary file: ``` super gharchive_gz/*.json.gz > gha.bsup ``` ## Appendix 2: Running the Tests -This appendix provides the raw tests and output that we run on a MacBook Pro to generate -the table of results above. +This appendix provides the raw tests and output from the [most recent archived run](https://super-cmd-perf.s3.us-east-2.amazonaws.com/2024-11-26_03-17-25.tgz) +of the tests via [automated scripts](https://github.com/brimdata/super/blob/main/scripts/super-cmd-perf) +on an AWS [`m6idn.2xlarge`](https://aws.amazon.com/ec2/instance-types/m6i/) instance. 
### Search Test ``` -; time super -c " - SELECT count() - FROM 'gha.bsup' - WHERE grep('in case you have any feedback 😊', payload.pull_request.body) -" -{count:2(uint64)} -super -c 12.70s user 0.69s system 415% cpu 3.223 total +About to execute +================ +clickhouse --queries-file /mnt/tmpdir/tmp.0REdlePG3O + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -time duckdb gha.db -c " - SELECT count() - FROM gha - WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -" ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'clickhouse --queries-file /mnt/tmpdir/tmp.0REdlePG3O' +Benchmark 1: clickhouse --queries-file /mnt/tmpdir/tmp.0REdlePG3O +2 + Time (abs ≡): 68.250 s [User: 67.960 s, System: 3.333 s] + +About to execute +================ +datafusion-cli --file /mnt/tmpdir/tmp.TO5M8YolwM + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'datafusion-cli --file /mnt/tmpdir/tmp.TO5M8YolwM' +Benchmark 1: datafusion-cli --file /mnt/tmpdir/tmp.TO5M8YolwM +DataFusion CLI v43.0.0 ++---------+ +| count() | ++---------+ +| 2 | ++---------+ +1 row(s) fetched. +Elapsed 10.942 seconds. 
+ + Time (abs ≡): 11.130 s [User: 65.904 s, System: 11.389 s] + +About to execute +================ +duckdb /mnt/gha.db < /mnt/tmpdir/tmp.GxPkGbQK8Y + +With query +========== +SELECT count() +FROM 'gha' +WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb /mnt/gha.db < /mnt/tmpdir/tmp.GxPkGbQK8Y' +Benchmark 1: duckdb /mnt/gha.db < /mnt/tmpdir/tmp.GxPkGbQK8Y ┌──────────────┐ │ count_star() │ │ int64 │ ├──────────────┤ │ 2 │ └──────────────┘ -duckdb gha.db -c 26.66s user 6.90s system 406% cpu 8.266 total + Time (abs ≡): 12.142 s [User: 81.132 s, System: 8.426 s] + +About to execute +================ +duckdb < /mnt/tmpdir/tmp.EtIuukLt2w + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -; time duckdb -c " - SELECT count() - FROM gha.parquet - WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -" ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb < /mnt/tmpdir/tmp.EtIuukLt2w' +Benchmark 1: duckdb < /mnt/tmpdir/tmp.EtIuukLt2w ┌──────────────┐ │ count_star() │ │ int64 │ ├──────────────┤ │ 2 │ └──────────────┘ -duckdb -c 42.71s user 6.06s system 582% cpu 8.380 total + Time (abs ≡): 12.890 s [User: 86.998 s, System: 6.305 s] + +About to execute +================ +super -z -I /mnt/tmpdir/tmp.JfLwNNwBeG + +With query +========== +SELECT count() +FROM '/mnt/gha.bsup' +WHERE grep('in case you have any feedback 😊', payload.pull_request.body) + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'super -z -I /mnt/tmpdir/tmp.JfLwNNwBeG' +Benchmark 1: super -z -I /mnt/tmpdir/tmp.JfLwNNwBeG +{count:2(uint64)} + Time (abs ≡): 6.325 s [User: 23.018 s, System: 1.652 s] +``` +### Search+ Test + +``` +About to execute +================ +clickhouse --queries-file /mnt/tmpdir/tmp.QRmnp0x8FT + +With query +========== +SELECT count() +FROM 
'/mnt/gha.parquet' +WHERE + id LIKE '%in case you have any feedback 😊%' + OR type LIKE '%in case you have any feedback 😊%' +... + OR payload.member.type LIKE '%in case you have any feedback 😊%' -; time datafusion-cli -c " - SELECT count() - FROM 'gha.parquet' - WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -" ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'clickhouse --queries-file /mnt/tmpdir/tmp.QRmnp0x8FT' +Benchmark 1: clickhouse --queries-file /mnt/tmpdir/tmp.QRmnp0x8FT +3 + Time (abs ≡): 844.648 s [User: 923.669 s, System: 19.038 s] + +About to execute +================ +datafusion-cli --file /mnt/tmpdir/tmp.NZ6GD2NGSD + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE + id LIKE '%in case you have any feedback 😊%' + OR type LIKE '%in case you have any feedback 😊%' + ... + OR payload.member.type LIKE '%in case you have any feedback 😊%' + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'datafusion-cli --file /mnt/tmpdir/tmp.NZ6GD2NGSD' +Benchmark 1: datafusion-cli --file /mnt/tmpdir/tmp.NZ6GD2NGSD DataFusion CLI v43.0.0 +---------+ | count() | +---------+ -| 2 | +| 3 | +---------+ -1 row(s) fetched. -Elapsed 8.819 seconds. +1 row(s) fetched. +Elapsed 20.913 seconds. 
-datafusion-cli -c 40.75s user 6.72s system 521% cpu 9.106 total + Time (abs ≡): 21.127 s [User: 126.933 s, System: 19.620 s] + +About to execute +================ +duckdb /mnt/gha.db < /mnt/tmpdir/tmp.fmZ4sHQJOv -; time clickhouse -q " - SELECT count() - FROM 'gha.parquet' - WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -" -2 -clickhouse -q 50.81s user 1.83s system 94% cpu 55.994 total -``` - -### Search+ Test - -``` -; time super -c " - SELECT count() - FROM 'gha.bsup' - WHERE grep('in case you have any feedback 😊') -" -{count:3(uint64)} -super -c 43.80s user 0.71s system 669% cpu 6.653 total - -; time duckdb gha.db < search.sql +With query +========== +SELECT count() +FROM 'gha' +WHERE + id LIKE '%in case you have any feedback 😊%' + OR type LIKE '%in case you have any feedback 😊%' + ... + OR payload.member.type LIKE '%in case you have any feedback 😊%' + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb /mnt/gha.db < /mnt/tmpdir/tmp.fmZ4sHQJOv' +Benchmark 1: duckdb /mnt/gha.db < /mnt/tmpdir/tmp.fmZ4sHQJOv ┌──────────────┐ │ count_star() │ │ int64 │ ├──────────────┤ │ 3 │ └──────────────┘ -duckdb gha.db < search.sql 73.60s user 33.29s system 435% cpu 24.563 total - -; time duckdb < search-parquet.sql + Time (abs ≡): 19.796 s [User: 140.238 s, System: 9.686 s] + +About to execute +================ +duckdb < /mnt/tmpdir/tmp.hE8ZzAlSRQ + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE + id LIKE '%in case you have any feedback 😊%' + OR type LIKE '%in case you have any feedback 😊%' + ... 
+ OR payload.member.type LIKE '%in case you have any feedback 😊%' + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb < /mnt/tmpdir/tmp.hE8ZzAlSRQ' +Benchmark 1: duckdb < /mnt/tmpdir/tmp.hE8ZzAlSRQ ┌──────────────┐ │ count_star() │ │ int64 │ ├──────────────┤ │ 3 │ └──────────────┘ -duckdb < search-parquet.sql 89.57s user 29.21s system 513% cpu 23.113 total + Time (abs ≡): 21.210 s [User: 143.903 s, System: 9.179 s] + +About to execute +================ +super -z -I /mnt/tmpdir/tmp.ncLqBOUkXD + +With query +========== +SELECT count() +FROM '/mnt/gha.bsup' +WHERE grep('in case you have any feedback 😊') + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'super -z -I /mnt/tmpdir/tmp.ncLqBOUkXD' +Benchmark 1: super -z -I /mnt/tmpdir/tmp.ncLqBOUkXD +{count:3(uint64)} + Time (abs ≡): 14.267 s [User: 105.204 s, System: 1.698 s] +``` -; time datafusion-cli -f search-parquet.sql +### Count Test + +``` +About to execute +================ +clickhouse --queries-file /mnt/tmpdir/tmp.xpShnx3ftw + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE actor.login='johnbieren' + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'clickhouse --queries-file /mnt/tmpdir/tmp.xpShnx3ftw' +Benchmark 1: clickhouse --queries-file /mnt/tmpdir/tmp.xpShnx3ftw +879 + Time (abs ≡): 1.034 s [User: 0.822 s, System: 0.233 s] + +About to execute +================ +datafusion-cli --file /mnt/tmpdir/tmp.eO6Lt0jBbs + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE actor.login='johnbieren' + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'datafusion-cli --file /mnt/tmpdir/tmp.eO6Lt0jBbs' +Benchmark 1: datafusion-cli --file /mnt/tmpdir/tmp.eO6Lt0jBbs DataFusion CLI v43.0.0 +---------+ | count() | +---------+ -| 3 | +| 879 | +---------+ -1 row(s) fetched. -Elapsed 18.184 seconds. -datafusion-cli -f search-parquet.sql 83.84s user 11.13s system 513% cpu 18.494 total +1 row(s) fetched. 
+Elapsed 0.340 seconds. -; time clickhouse --queries-file search-parquet.sql -3 -clickhouse --queries-file search-parquet.sql 515.68s user 5.50s system 112% cpu 7:43.37 total -``` -### Count Test + Time (abs ≡): 0.381 s [User: 1.578 s, System: 0.411 s] + +About to execute +================ +duckdb /mnt/gha.db < /mnt/tmpdir/tmp.dEgCWl2Iem -``` -; time super -c " - SELECT count() - FROM 'gha.bsup' - WHERE actor.login='johnbieren' -" -{count:879(uint64)} -super -c 13.81s user 0.71s system 449% cpu 3.233 total - -; time SUPER_VAM=1 super -c " - SELECT count() - FROM 'gha.parquet' - WHERE actor.login='johnbieren' -" -{count:879(uint64)} -SUPER_VAM=1 super -c 0.43s user 0.08s system 277% cpu 0.182 total +With query +========== +SELECT count() +FROM 'gha' +WHERE actor.login='johnbieren' -; time duckdb gha.db -c " - SELECT count() - FROM gha - WHERE actor.login='johnbieren' -" ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb /mnt/gha.db < /mnt/tmpdir/tmp.dEgCWl2Iem' +Benchmark 1: duckdb /mnt/gha.db < /mnt/tmpdir/tmp.dEgCWl2Iem ┌──────────────┐ │ count_star() │ │ int64 │ ├──────────────┤ │ 879 │ └──────────────┘ -duckdb gha.db -c 0.64s user 0.06s system 517% cpu 0.134 total + Time (abs ≡): 0.175 s [User: 1.042 s, System: 0.106 s] + +About to execute +================ +duckdb < /mnt/tmpdir/tmp.GNGEkrs6IU + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE actor.login='johnbieren' -; time duckdb -c " - SELECT count() - FROM 'gha.parquet' - WHERE actor.login='johnbieren' -" ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb < /mnt/tmpdir/tmp.GNGEkrs6IU' +Benchmark 1: duckdb < /mnt/tmpdir/tmp.GNGEkrs6IU ┌──────────────┐ │ count_star() │ │ int64 │ ├──────────────┤ │ 879 │ └──────────────┘ -duckdb gha.db -c 1.14s user 0.14s system 490% cpu 0.261 total - -DataFusion CLI v43.0.0 -+---------+ -| count() | -+---------+ -| 879 | -+---------+ -1 row(s) fetched. -Elapsed 0.203 seconds. 
+ Time (abs ≡): 0.423 s [User: 2.256 s, System: 0.181 s] + +About to execute +================ +super -z -I /mnt/tmpdir/tmp.dyu0120H2m + +With query +========== +SELECT count() +FROM '/mnt/gha.bsup' +WHERE actor.login='johnbieren' -datafusion-cli -c 0.93s user 0.15s system 453% cpu 0.238 total ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'super -z -I /mnt/tmpdir/tmp.dyu0120H2m' +Benchmark 1: super -z -I /mnt/tmpdir/tmp.dyu0120H2m +{count:879(uint64)} + Time (abs ≡): 5.745 s [User: 17.240 s, System: 1.509 s] + +About to execute +================ +SUPER_VAM=1 super -z -I /mnt/tmpdir/tmp.yogAuyCHWe + +With query +========== +SELECT count() +FROM '/mnt/gha.parquet' +WHERE actor.login='johnbieren' -; time clickhouse -q " - SELECT count() - FROM 'gha.parquet' - WHERE actor.login='johnbieren' -" -879 -clickhouse -q 0.86s user 0.07s system 93% cpu 1.001 total ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'SUPER_VAM=1 super -z -I /mnt/tmpdir/tmp.yogAuyCHWe' +Benchmark 1: SUPER_VAM=1 super -z -I /mnt/tmpdir/tmp.yogAuyCHWe +{count:879(uint64)} + Time (abs ≡): 0.298 s [User: 0.769 s, System: 0.248 s] ``` ### Agg Test ``` -; time super -c " - SELECT count(),type - FROM 'gha.bsup' - WHERE repo.name='duckdb/duckdb' - GROUP BY type -" -{type:"PullRequestReviewEvent",count:14(uint64)} -{type:"IssueCommentEvent",count:30(uint64)} -{type:"WatchEvent",count:29(uint64)} -{type:"PullRequestEvent",count:35(uint64)} -{type:"PushEvent",count:15(uint64)} -{type:"IssuesEvent",count:9(uint64)} -{type:"ForkEvent",count:3(uint64)} -{type:"PullRequestReviewCommentEvent",count:7(uint64)} -super -c 12.24s user 0.68s system 413% cpu 3.129 total - -; time SUPER_VAM=1 super -c " - SELECT count(),type - FROM 'gha.parquet' - WHERE repo.name='duckdb/duckdb' - GROUP BY type -" -{type:"IssueCommentEvent",count:30(uint64)} -{type:"PullRequestEvent",count:35(uint64)} -{type:"PushEvent",count:15(uint64)} -{type:"WatchEvent",count:29(uint64)} 
-{type:"PullRequestReviewEvent",count:14(uint64)} -{type:"ForkEvent",count:3(uint64)} -{type:"PullRequestReviewCommentEvent",count:7(uint64)} -{type:"IssuesEvent",count:9(uint64)} -SUPER_VAM=1 super -c 1.01s user 0.13s system 421% cpu 0.271 total - -; time duckdb gha.db -c " - SELECT count(),type - FROM gha - WHERE repo.name='duckdb/duckdb' - GROUP BY type -" +About to execute +================ +clickhouse --queries-file /mnt/tmpdir/tmp.FNKcK1lhGU + +With query +========== +SELECT count(),type +FROM '/mnt/gha.parquet' +WHERE repo.name='duckdb/duckdb' +GROUP BY type + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'clickhouse --queries-file /mnt/tmpdir/tmp.FNKcK1lhGU' +Benchmark 1: clickhouse --queries-file /mnt/tmpdir/tmp.FNKcK1lhGU +30 IssueCommentEvent +14 PullRequestReviewEvent +15 PushEvent +29 WatchEvent +9 IssuesEvent +7 PullRequestReviewCommentEvent +3 ForkEvent +35 PullRequestEvent + Time (abs ≡): 0.856 s [User: 0.741 s, System: 0.178 s] + +About to execute +================ +datafusion-cli --file /mnt/tmpdir/tmp.cv7JPVFkc6 + +With query +========== +SELECT count(),type +FROM '/mnt/gha.parquet' +WHERE repo.name='duckdb/duckdb' +GROUP BY type + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'datafusion-cli --file /mnt/tmpdir/tmp.cv7JPVFkc6' +Benchmark 1: datafusion-cli --file /mnt/tmpdir/tmp.cv7JPVFkc6 +DataFusion CLI v43.0.0 ++---------+-------------------------------+ +| count() | type | ++---------+-------------------------------+ +| 35 | PullRequestEvent | +| 14 | PullRequestReviewEvent | +| 7 | PullRequestReviewCommentEvent | +| 3 | ForkEvent | +| 15 | PushEvent | +| 30 | IssueCommentEvent | +| 9 | IssuesEvent | +| 29 | WatchEvent | ++---------+-------------------------------+ +8 row(s) fetched. +Elapsed 0.324 seconds. 
+ + Time (abs ≡): 0.354 s [User: 1.299 s, System: 0.413 s] + +About to execute +================ +duckdb /mnt/gha.db < /mnt/tmpdir/tmp.qaqGNHoHPE + +With query +========== +SELECT count(),type +FROM 'gha' +WHERE repo.name='duckdb/duckdb' +GROUP BY type + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb /mnt/gha.db < /mnt/tmpdir/tmp.qaqGNHoHPE' +Benchmark 1: duckdb /mnt/gha.db < /mnt/tmpdir/tmp.qaqGNHoHPE ┌──────────────┬───────────────────────────────┐ │ count_star() │ type │ │ int64 │ varchar │ ├──────────────┼───────────────────────────────┤ │ 3 │ ForkEvent │ -│ 35 │ PullRequestEvent │ +│ 14 │ PullRequestReviewEvent │ │ 29 │ WatchEvent │ +│ 35 │ PullRequestEvent │ +│ 30 │ IssueCommentEvent │ │ 7 │ PullRequestReviewCommentEvent │ │ 15 │ PushEvent │ │ 9 │ IssuesEvent │ -│ 14 │ PullRequestReviewEvent │ -│ 30 │ IssueCommentEvent │ └──────────────┴───────────────────────────────┘ -duckdb gha.db -c 0.49s user 0.06s system 466% cpu 0.119 total - -; time duckdb -c " - SELECT count(),type - FROM 'gha.parquet' - WHERE repo.name='duckdb/duckdb' - GROUP BY type -" + Time (abs ≡): 0.144 s [User: 0.770 s, System: 0.143 s] + +About to execute +================ +duckdb < /mnt/tmpdir/tmp.3BIyBWjqG0 + +With query +========== +SELECT count(),type +FROM '/mnt/gha.parquet' +WHERE repo.name='duckdb/duckdb' +GROUP BY type + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb < /mnt/tmpdir/tmp.3BIyBWjqG0' +Benchmark 1: duckdb < /mnt/tmpdir/tmp.3BIyBWjqG0 ┌──────────────┬───────────────────────────────┐ │ count_star() │ type │ │ int64 │ varchar │ ├──────────────┼───────────────────────────────┤ +│ 3 │ ForkEvent │ +│ 15 │ PushEvent │ │ 9 │ IssuesEvent │ │ 7 │ PullRequestReviewCommentEvent │ -│ 15 │ PushEvent │ │ 14 │ PullRequestReviewEvent │ -│ 3 │ ForkEvent │ │ 29 │ WatchEvent │ -│ 35 │ PullRequestEvent │ │ 30 │ IssueCommentEvent │ +│ 35 │ PullRequestEvent │ └──────────────┴───────────────────────────────┘ -duckdb -c 0.73s user 0.14s system 
413% cpu 0.211 total - -; time datafusion-cli -c " - SELECT count(),type - FROM 'gha.parquet' - WHERE repo.name='duckdb/duckdb' - GROUP BY type -" -DataFusion CLI v43.0.0 -+---------+-------------------------------+ -| count() | type | -+---------+-------------------------------+ -| 15 | PushEvent | -| 35 | PullRequestEvent | -| 7 | PullRequestReviewCommentEvent | -| 14 | PullRequestReviewEvent | -| 30 | IssueCommentEvent | -| 9 | IssuesEvent | -| 29 | WatchEvent | -| 3 | ForkEvent | -+---------+-------------------------------+ -8 row(s) fetched. -Elapsed 0.200 seconds. + Time (abs ≡): 0.316 s [User: 1.473 s, System: 0.174 s] + +About to execute +================ +super -z -I /mnt/tmpdir/tmp.QieGBDCfVB + +With query +========== +SELECT count(),type +FROM '/mnt/gha.bsup' +WHERE repo.name='duckdb/duckdb' +GROUP BY type -datafusion-cli -c 0.80s user 0.15s system 398% cpu 0.238 total ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'super -z -I /mnt/tmpdir/tmp.QieGBDCfVB' +Benchmark 1: super -z -I /mnt/tmpdir/tmp.QieGBDCfVB +{type:"IssuesEvent",count:9(uint64)} +{type:"ForkEvent",count:3(uint64)} +{type:"PullRequestReviewCommentEvent",count:7(uint64)} +{type:"PullRequestReviewEvent",count:14(uint64)} +{type:"IssueCommentEvent",count:30(uint64)} +{type:"WatchEvent",count:29(uint64)} +{type:"PullRequestEvent",count:35(uint64)} +{type:"PushEvent",count:15(uint64)} + Time (abs ≡): 5.627 s [User: 15.358 s, System: 1.606 s] + +About to execute +================ +SUPER_VAM=1 super -z -I /mnt/tmpdir/tmp.XI76knYAGz + +With query +========== +SELECT count(),type +FROM '/mnt/gha.parquet' +WHERE repo.name='duckdb/duckdb' +GROUP BY type -; time clickhouse -q " - SELECT count(),type - FROM 'gha.parquet' - WHERE repo.name='duckdb/duckdb' - GROUP BY type -" -30 IssueCommentEvent -14 PullRequestReviewEvent -15 PushEvent -29 WatchEvent -9 IssuesEvent -7 PullRequestReviewCommentEvent -3 ForkEvent -35 PullRequestEvent -clickhouse -q 0.77s user 0.11s system 97% cpu 0.908 
total ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'SUPER_VAM=1 super -z -I /mnt/tmpdir/tmp.XI76knYAGz' +Benchmark 1: SUPER_VAM=1 super -z -I /mnt/tmpdir/tmp.XI76knYAGz +{type:"PullRequestEvent",count:35(uint64)} +{type:"IssueCommentEvent",count:30(uint64)} +{type:"PushEvent",count:15(uint64)} +{type:"IssuesEvent",count:9(uint64)} +{type:"ForkEvent",count:3(uint64)} +{type:"PullRequestReviewCommentEvent",count:7(uint64)} +{type:"PullRequestReviewEvent",count:14(uint64)} +{type:"WatchEvent",count:29(uint64)} + Time (abs ≡): 0.498 s [User: 2.133 s, System: 0.329 s] ``` ### Union Test ``` -time super -c " - FROM 'gha.bsup' - | SELECT VALUE payload.pull_request - | WHERE this IS NOT NULL - | UNNEST [...assignees, assignee] - | WHERE this IS NOT NULL - | AGGREGATE count() BY assignee:=login - | ORDER BY count DESC - | LIMIT 5 -" -{assignee:"poad",count:1966(uint64)} -{assignee:"vinayakkulkarni",count:508(uint64)} -{assignee:"tmtmtmtm",count:356(uint64)} -{assignee:"AMatutat",count:260(uint64)} -{assignee:"danwinship",count:208(uint64)} -super -c 12.39s user 0.95s system 351% cpu 3.797 total - -; time duckdb gha.db -c " - WITH assignees AS ( - SELECT payload.pull_request.assignee.login assignee - FROM gha - UNION ALL - SELECT unnest(payload.pull_request.assignees).login assignee - FROM gha +About to execute +================ +clickhouse --queries-file /mnt/tmpdir/tmp.rpGStdRtoN + +With query +========== +WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM '/mnt/gha.parquet' + UNION ALL + SELECT arrayJoin(payload.pull_request.assignees).login assignee + FROM '/mnt/gha.parquet' +) +SELECT assignee, count(*) count +FROM assignees +WHERE assignee IS NOT NULL +GROUP BY assignee +ORDER BY count DESC +LIMIT 5 + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'clickhouse --queries-file /mnt/tmpdir/tmp.rpGStdRtoN' +Benchmark 1: clickhouse --queries-file /mnt/tmpdir/tmp.rpGStdRtoN +poad 1966 +vinayakkulkarni 508 
+tmtmtmtm 356 +AMatutat 260 +danwinship 208 + Time (abs ≡): 70.276 s [User: 139.539 s, System: 6.504 s] + +About to execute +================ +datafusion-cli --file /mnt/tmpdir/tmp.V2yirMdQ2i + +With query +========== +WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM '/mnt/gha.parquet' + UNION ALL + SELECT object.login as assignee FROM ( + SELECT unnest(payload.pull_request.assignees) object + FROM '/mnt/gha.parquet' ) - SELECT assignee, count(*) count - FROM assignees - WHERE assignee NOT NULL - GROUP BY assignee - ORDER BY count DESC - LIMIT 5 -" +) +SELECT assignee, count() count +FROM assignees +WHERE assignee IS NOT NULL +GROUP BY assignee +ORDER BY count DESC +LIMIT 5 + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'datafusion-cli --file /mnt/tmpdir/tmp.V2yirMdQ2i' +Benchmark 1: datafusion-cli --file /mnt/tmpdir/tmp.V2yirMdQ2i +DataFusion CLI v43.0.0 ++-----------------+-------+ +| assignee | count | ++-----------------+-------+ +| poad | 1966 | +| vinayakkulkarni | 508 | +| tmtmtmtm | 356 | +| AMatutat | 260 | +| danwinship | 208 | ++-----------------+-------+ +5 row(s) fetched. +Elapsed 24.068 seconds. 
+ + Time (abs ≡): 24.336 s [User: 161.911 s, System: 24.355 s] + +About to execute +================ +duckdb /mnt/gha.db < /mnt/tmpdir/tmp.yz1E2h5G10 + +With query +========== +WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM 'gha' + UNION ALL + SELECT unnest(payload.pull_request.assignees).login assignee + FROM 'gha' +) +SELECT assignee, count(*) count +FROM assignees +WHERE assignee IS NOT NULL +GROUP BY assignee +ORDER BY count DESC +LIMIT 5 + ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb /mnt/gha.db < /mnt/tmpdir/tmp.yz1E2h5G10' +Benchmark 1: duckdb /mnt/gha.db < /mnt/tmpdir/tmp.yz1E2h5G10 ┌─────────────────┬───────┐ │ assignee │ count │ │ varchar │ int64 │ @@ -1299,23 +1574,30 @@ super -c 12.39s user 0.95s system 351% cpu 3.797 total │ AMatutat │ 260 │ │ danwinship │ 208 │ └─────────────────┴───────┘ -duckdb gha.db -c 3119.93s user 90.86s system 719% cpu 7:26.22 total + Time (abs ≡): 519.227 s [User: 4075.550 s, System: 14.520 s] + +About to execute +================ +duckdb < /mnt/tmpdir/tmp.30X1TO2UbL + +With query +========== +WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM '/mnt/gha.parquet' + UNION ALL + SELECT unnest(payload.pull_request.assignees).login assignee + FROM '/mnt/gha.parquet' +) +SELECT assignee, count(*) count +FROM assignees +WHERE assignee IS NOT NULL +GROUP BY assignee +ORDER BY count DESC +LIMIT 5 -time duckdb -c " - WITH assignees AS ( - SELECT payload.pull_request.assignee.login assignee - FROM 'gha.parquet' - UNION ALL - SELECT unnest(payload.pull_request.assignees).login assignee - FROM 'gha.parquet' - ) - SELECT assignee, count(*) count - FROM assignees - WHERE assignee NOT NULL - GROUP BY assignee - ORDER BY count DESC - LIMIT 5 -" ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'duckdb < /mnt/tmpdir/tmp.30X1TO2UbL' +Benchmark 1: duckdb < /mnt/tmpdir/tmp.30X1TO2UbL ┌─────────────────┬───────┐ │ assignee │ count │ │ 
varchar │ int64 │ @@ -1326,59 +1608,27 @@ time duckdb -c " │ AMatutat │ 260 │ │ danwinship │ 208 │ └─────────────────┴───────┘ -duckdb -c 2914.72s user 107.15s system 721% cpu 6:58.68 total - -time datafusion-cli -c " - WITH assignees AS ( - SELECT payload.pull_request.assignee.login assignee - FROM 'gha.parquet' - UNION ALL - SELECT object.login as assignee FROM ( - SELECT unnest(payload.pull_request.assignees) object - FROM 'gha.parquet' - ) - ) - SELECT assignee, count() count - FROM assignees - WHERE assignee IS NOT NULL - GROUP BY assignee - ORDER BY count DESC - LIMIT 5 -" -DataFusion CLI v43.0.0 -+-----------------+-------+ -| assignee | count | -+-----------------+-------+ -| poad | 1966 | -| vinayakkulkarni | 508 | -| tmtmtmtm | 356 | -| AMatutat | 260 | -| danwinship | 208 | -+-----------------+-------+ -5 row(s) fetched. -Elapsed 39.012 seconds. - -datafusion-cli -c 116.97s user 44.50s system 408% cpu 39.533 total + Time (abs ≡): 499.909 s [User: 3718.128 s, System: 9.680 s] + +About to execute +================ +super -z -I /mnt/tmpdir/tmp.3qO9ALablA + +With query +========== +FROM '/mnt/gha.bsup' +| UNNEST [...payload.pull_request.assignees, payload.pull_request.assignee] +| WHERE this IS NOT NULL +| AGGREGATE count() BY assignee:=login +| ORDER BY count DESC +| LIMIT 5 -; time clickhouse -q " - WITH assignees AS ( - SELECT payload.pull_request.assignee.login assignee - FROM 'gha.parquet' - UNION ALL - SELECT arrayJoin(payload.pull_request.assignees).login assignee - FROM 'gha.parquet' - ) - SELECT assignee, count(*) count - FROM assignees - WHERE assignee IS NOT NULL - GROUP BY assignee - ORDER BY count DESC - LIMIT 5 -" -poad 1966 -vinayakkulkarni 508 -tmtmtmtm 356 -AMatutat 260 -danwinship 208 -clickhouse -q 105.49s user 6.54s system 169% cpu 1:06.27 total ++ hyperfine --show-output --warmup 1 --runs 1 --time-unit second 'super -z -I /mnt/tmpdir/tmp.3qO9ALablA' +Benchmark 1: super -z -I /mnt/tmpdir/tmp.3qO9ALablA 
+{assignee:"poad",count:1966(uint64)} +{assignee:"vinayakkulkarni",count:508(uint64)} +{assignee:"tmtmtmtm",count:356(uint64)} +{assignee:"AMatutat",count:260(uint64)} +{assignee:"danwinship",count:208(uint64)} + Time (abs ≡): 8.180 s [User: 17.197 s, System: 1.909 s] ``` diff --git a/scripts/super-cmd-perf/README.md b/scripts/super-cmd-perf/README.md new file mode 100644 index 0000000000..4465c71574 --- /dev/null +++ b/scripts/super-cmd-perf/README.md @@ -0,0 +1,106 @@ +# Query Performance From `super` Command Doc + +These scripts were used to generate the results in the +[Performance](https://zed.brimdata.io/docs/next/commands/super#performance) +section of the [`super` command doc](https://zed.brimdata.io/docs/next/commands/super). +The scripts have been made available to allow for easy reproduction of the +results under different conditions and/or as tested systems evolve. + +# Environments + +The scripts were written to be easily run in two different environments. + +## AWS + +As an environment that's available to everyone, the scripts were developed +primarily for use on a "scratch" EC2 instance in [AWS](https://aws.amazon.com/). +Specifically, we chose the [`m6idn.2xlarge`](https://aws.amazon.com/ec2/instance-types/m6i/) +instance that has the following specifications: + +* 8x vCPU +* 32 GB of RAM +* 474 GB NVMe instance SSD + +The instance SSD in particular was seen as important to ensure consistent I/O +performance. + +Assuming a freshly-created `m6idn.2xlarge` instance running Ubuntu 24.04, to +start the run: + +``` +curl -s https://raw.githubusercontent.com/brimdata/super/main/scripts/super-cmd-perf/benchmark.sh | bash -xv 2>&1 | tee runlog.txt +``` + +The run proceeds in three phases: + +1. **(AWS only)** Instance SSD is formatted and required tools & data platforms are downloaded/installed +2. Test data is downloaded and loaded into needed storage formats +3.
Queries are executed on all data platforms + +As the benchmarks may take a long time to run, the use of [`screen`](https://www.gnu.org/software/screen/) +or a similar "detachable" terminal tool is recommended in case your remote +network connection drops during a run. + +## macOS/other + +Whereas on [AWS](#aws) the scripts assume they're in a "scratch" environment +where they may format the instance SSD for optimal storage and install required +software, on other systems such as macOS it's assumed the required data +platforms are already installed, and they will skip ahead right to +downloading/loading test data and then running queries. + +For instance on macOS, the software needed can be first installed via: + +``` +brew install hyperfine datafusion duckdb clickhouse go +go install github.com/brimdata/super/cmd/super@main +``` + +Then clone the [super repo](https://github.com/brimdata/super.git) and run the +benchmarks. + +``` +git clone https://github.com/brimdata/super.git +cd super/scripts/super-cmd-perf +./benchmark.sh +``` + +All test data will remain in this directory. + +# Results + +Results from the run will accumulate in a subdirectory named for the date/time +when the run started, e.g., `2024-11-19_01:10:30/`. In this directory, summary +reports will be created in files ending in `.md` and `.csv` extensions, and +details from each individual step in generating the results will be in files +ending in `.out`. If run on AWS using the [`curl` command line shown above](#aws), +a `runlog.txt` file will also be present that holds the full console output of the +entire run. + +An archive of results from our most recent run of the benchmarks on November +26, 2024 can be downloaded [here](https://super-cmd-perf.s3.us-east-2.amazonaws.com/2024-11-26_03-17-25.tgz). + +# Debugging + +The scripts are configured to exit immediately if failures occur during the +run.
If you encounter a failure, look in the results directory for the `.out` +file mentioned last in the console output as this will contain any detailed +error message from the operation that experienced the failure. + +A problem that was encountered when developing the scripts that you may also +encounter is DuckDB running out of memory. Specifically, this happened when +we tried to run the scripts on an Intel-based Macbook with only 16 GB of +RAM, and this is part of why we used an AWS instance with 32 GB of RAM as the +reference platform. On the Macbooks, we found we could work around the memory +problem by telling DuckDB it had the use of more memory than its default +[80% heuristic for `memory_limit`](https://duckdb.org/docs/configuration/overview.html). +The scripts support an environment variable to make it easy to increase this +value, e.g., we found the scripts ran successfully at 16 GB: + +``` +$ DUCKDB_MEMORY_LIMIT="16GB" ./benchmark.sh +``` + +Of course, this ultimately caused swapping on our Macbook and a significant +hit to performance, but it at least allowed the scripts to run without +failure. diff --git a/scripts/super-cmd-perf/benchmark.sh b/scripts/super-cmd-perf/benchmark.sh new file mode 100755 index 0000000000..35c9449388 --- /dev/null +++ b/scripts/super-cmd-perf/benchmark.sh @@ -0,0 +1,97 @@ +#!/bin/bash -xv +set -euo pipefail +export RUNNING_ON_AWS_EC2="" + +# If we can detect we're running on an AWS EC2 m6idn.2xlarge instance, we'll +# treat it as a scratch host, installing all needed software and using the +# local SSD for best I/O performance. 
+if command -v dmidecode && [ "$(sudo dmidecode --string system-uuid | cut -c1-3)" == "ec2" ] && [ "$(TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") && curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type)" == "m6idn.2xlarge" ]; then + + export RUNNING_ON_AWS_EC2=true + + sudo apt-get -y update + sudo apt-get -y upgrade + sudo apt-get -y install make gcc unzip hyperfine + + # Prepare local SSD for best I/O performance + sudo fdisk -l /dev/nvme1n1 + sudo mkfs.ext4 -E discard -F /dev/nvme1n1 + sudo mount /dev/nvme1n1 /mnt + sudo chown ubuntu:ubuntu /mnt + sudo chmod 777 /mnt + echo 'export TMPDIR="/mnt/tmpdir"' >> "$HOME"/.profile + mkdir /mnt/tmpdir + + # Install ClickHouse + if ! command -v clickhouse-client > /dev/null 2>&1; then + sudo apt-get install -y apt-transport-https ca-certificates curl gnupg + curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" | sudo tee \ + /etc/apt/sources.list.d/clickhouse.list + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-client + fi + + # Install DuckDB + if ! command -v duckdb > /dev/null 2>&1; then + curl -L -O https://github.com/duckdb/duckdb/releases/download/v1.1.3/duckdb_cli-linux-amd64.zip + unzip duckdb_cli-linux-amd64.zip + sudo mv duckdb /usr/local/bin + fi + + # Install Rust + curl -L -O https://static.rust-lang.org/dist/rust-1.82.0-x86_64-unknown-linux-gnu.tar.xz + tar xf rust-1.82.0-x86_64-unknown-linux-gnu.tar.xz + sudo rust-1.82.0-x86_64-unknown-linux-gnu/install.sh + # shellcheck disable=SC2016 + echo 'export PATH="$PATH:$HOME/.cargo/bin"' >> "$HOME"/.profile + + # Install DataFusion CLI + if ! 
command -v datafusion-cli > /dev/null 2>&1; then + cargo install datafusion-cli + fi + + # Install Go + if ! command -v go > /dev/null 2>&1; then + curl -L -O https://go.dev/dl/go1.23.3.linux-amd64.tar.gz + rm -rf /usr/local/go && sudo tar -C /usr/local -xzf go1.23.3.linux-amd64.tar.gz + # shellcheck disable=SC2016 + echo 'export PATH="$PATH:/usr/local/go/bin:$HOME/go/bin"' >> "$HOME"/.profile + source "$HOME"/.profile + fi + + # Install SuperDB + if ! command -v super > /dev/null 2>&1; then + git clone https://github.com/brimdata/super.git + cd super + make install + fi + + cd scripts/super-cmd-perf + +fi + +rundir="$(date +%F_%T)" +mkdir "$rundir" +report="$rundir/report_$rundir.md" + +echo -e "|**Software**|**Version**|\n|-|-|" | tee -a "$report" +for software in super duckdb datafusion-cli clickhouse +do + if ! command -v $software > /dev/null; then + echo "error: \"$software\" not found in PATH" + exit 1 + fi + echo "|$software|$($software --version)|" | tee -a "$report" +done +echo >> "$report" + +# Prepare the test data +./prep-data.sh "$rundir" + +# Run the queries and generate the summary report +./run-queries.sh "$rundir" + +if [ -n "$RUNNING_ON_AWS_EC2" ]; then + mv "$HOME/runlog.txt" "$rundir" +fi diff --git a/scripts/super-cmd-perf/prep-data.sh b/scripts/super-cmd-perf/prep-data.sh new file mode 100755 index 0000000000..a1035092bf --- /dev/null +++ b/scripts/super-cmd-perf/prep-data.sh @@ -0,0 +1,58 @@ +#!/bin/bash -xv +set -euo pipefail +pushd "$(cd "$(dirname "$0")" && pwd)" + +if [ "$#" -ne 1 ]; then + echo "Specify results directory string" + exit 1 +fi +rundir="$(pwd)/$1" +mkdir -p "$rundir" + +RUNNING_ON_AWS_EC2="${RUNNING_ON_AWS_EC2:-}" +if [ -n "$RUNNING_ON_AWS_EC2" ]; then + cd /mnt +fi + +function run_cmd { + outputfile="$1" + shift + { hyperfine \ + --show-output \ + --warmup 0 \ + --runs 1 \ + --time-unit second \ + "$@" ; + } \ + > "$outputfile" \ + 2>&1 +} + +mkdir gharchive_gz +cd gharchive_gz +for num in $(seq 0 23) +do + curl -L -O 
"https://data.gharchive.org/2023-02-08-${num}.json.gz" +done +cd .. + +DUCKDB_MEMORY_LIMIT="${DUCKDB_MEMORY_LIMIT:-}" +if [ -n "$DUCKDB_MEMORY_LIMIT" ]; then + increase_duckdb_memory_limit='SET memory_limit = '\'"${DUCKDB_MEMORY_LIMIT}"\''; ' +else + increase_duckdb_memory_limit="" +fi + +run_cmd \ + "$rundir/duckdb-table-create.out" \ + "duckdb gha.db -c \"${increase_duckdb_memory_limit}CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', union_by_name=true)\"" + +run_cmd \ + "$rundir/duckdb-parquet-create.out" \ + "duckdb gha.db -c \"${increase_duckdb_memory_limit}COPY (from gha) TO 'gha.parquet'\"" + +run_cmd \ + "$rundir/super-bsup-create.out" \ + "super -o gha.bsup gharchive_gz/*.json.gz" + +du -h gha.db gha.parquet gha.bsup gharchive_gz diff --git a/scripts/super-cmd-perf/queries/agg.sql b/scripts/super-cmd-perf/queries/agg.sql new file mode 100644 index 0000000000..074916587c --- /dev/null +++ b/scripts/super-cmd-perf/queries/agg.sql @@ -0,0 +1,4 @@ +SELECT count(),type +FROM '__SOURCE__' +WHERE repo.name='duckdb/duckdb' +GROUP BY type diff --git a/scripts/super-cmd-perf/queries/count.sql b/scripts/super-cmd-perf/queries/count.sql new file mode 100644 index 0000000000..289bbbd13c --- /dev/null +++ b/scripts/super-cmd-perf/queries/count.sql @@ -0,0 +1,3 @@ +SELECT count() +FROM '__SOURCE__' +WHERE actor.login='johnbieren' diff --git a/scripts/super-cmd-perf/queries/search+.spq b/scripts/super-cmd-perf/queries/search+.spq new file mode 100644 index 0000000000..5d081071f0 --- /dev/null +++ b/scripts/super-cmd-perf/queries/search+.spq @@ -0,0 +1,3 @@ +SELECT count() +FROM '__SOURCE__' +WHERE grep('in case you have any feedback 😊') diff --git a/scripts/super-cmd-perf/queries/search+.sql b/scripts/super-cmd-perf/queries/search+.sql new file mode 100644 index 0000000000..7ca7ee7efd --- /dev/null +++ b/scripts/super-cmd-perf/queries/search+.sql @@ -0,0 +1,489 @@ +SELECT count() +FROM '__SOURCE__' +WHERE + id LIKE '%in case you have any feedback 😊%' + OR type 
LIKE '%in case you have any feedback 😊%' + OR actor.login LIKE '%in case you have any feedback 😊%' + OR actor.display_login LIKE '%in case you have any feedback 😊%' + OR actor.gravatar_id LIKE '%in case you have any feedback 😊%' + OR actor.url LIKE '%in case you have any feedback 😊%' + OR actor.avatar_url LIKE '%in case you have any feedback 😊%' + OR repo.name LIKE '%in case you have any feedback 😊%' + OR repo.url LIKE '%in case you have any feedback 😊%' + OR payload.ref LIKE '%in case you have any feedback 😊%' + OR payload.ref_type LIKE '%in case you have any feedback 😊%' + OR payload.pusher_type LIKE '%in case you have any feedback 😊%' + OR payload.head LIKE '%in case you have any feedback 😊%' + OR payload.before LIKE '%in case you have any feedback 😊%' + OR payload.master_branch LIKE '%in case you have any feedback 😊%' + OR payload.description LIKE '%in case you have any feedback 😊%' + OR payload.action LIKE '%in case you have any feedback 😊%' + OR org.login LIKE '%in case you have any feedback 😊%' + OR org.gravatar_id LIKE '%in case you have any feedback 😊%' + OR org.url LIKE '%in case you have any feedback 😊%' + OR org.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.review.node_id LIKE '%in case you have any feedback 😊%' + OR payload.review.user.login LIKE '%in case you have any feedback 😊%' + OR payload.review.user.node_id LIKE '%in case you have any feedback 😊%' + OR payload.review.user.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.review.user.url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.html_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.following_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.starred_url LIKE '%in case 
you have any feedback 😊%' + OR payload.review.user.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.events_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.review.user.type LIKE '%in case you have any feedback 😊%' + OR payload.review.body LIKE '%in case you have any feedback 😊%' + OR payload.review.commit_id LIKE '%in case you have any feedback 😊%' + OR payload.review.state LIKE '%in case you have any feedback 😊%' + OR payload.review.html_url LIKE '%in case you have any feedback 😊%' + OR payload.review.pull_request_url LIKE '%in case you have any feedback 😊%' + OR payload.review.author_association LIKE '%in case you have any feedback 😊%' + OR payload.review._links.html.href LIKE '%in case you have any feedback 😊%' + OR payload.review._links.pull_request.href LIKE '%in case you have any feedback 😊%' + OR payload.comment.url LIKE '%in case you have any feedback 😊%' + OR payload.comment.html_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.node_id LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.login LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.node_id LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.html_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.following_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.gists_url LIKE '%in case you have any feedback 😊%' + 
OR payload.comment.user.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.events_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.user.type LIKE '%in case you have any feedback 😊%' + OR payload.comment.path LIKE '%in case you have any feedback 😊%' + OR payload.comment.commit_id LIKE '%in case you have any feedback 😊%' + OR payload.comment.author_association LIKE '%in case you have any feedback 😊%' + OR payload.comment.body LIKE '%in case you have any feedback 😊%' + OR payload.comment.reactions.url LIKE '%in case you have any feedback 😊%' + OR payload.comment.issue_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.diff_hunk LIKE '%in case you have any feedback 😊%' + OR payload.comment.original_commit_id LIKE '%in case you have any feedback 😊%' + OR payload.comment.pull_request_url LIKE '%in case you have any feedback 😊%' + OR payload.comment.start_side LIKE '%in case you have any feedback 😊%' + OR payload.comment.side LIKE '%in case you have any feedback 😊%' + OR payload.issue.url LIKE '%in case you have any feedback 😊%' + OR payload.issue.repository_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.labels_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.comments_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.events_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.html_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.node_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.title LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.login LIKE '%in case you have any feedback 😊%' + 
OR payload.issue.user.node_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.html_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.following_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.events_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.user.type LIKE '%in case you have any feedback 😊%' + OR payload.issue.state LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.login LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.node_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.html_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.following_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.starred_url LIKE '%in case you have any 
feedback 😊%' + OR payload.issue.assignee.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.events_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.assignee.type LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.html_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.labels_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.node_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.title LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.description LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.login LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.node_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.html_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.following_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR 
payload.issue.milestone.creator.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.events_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.creator.type LIKE '%in case you have any feedback 😊%' + OR payload.issue.milestone.state LIKE '%in case you have any feedback 😊%' + OR payload.issue.author_association LIKE '%in case you have any feedback 😊%' + OR payload.issue.active_lock_reason LIKE '%in case you have any feedback 😊%' + OR payload.issue.body LIKE '%in case you have any feedback 😊%' + OR payload.issue.reactions.url LIKE '%in case you have any feedback 😊%' + OR payload.issue.timeline_url LIKE '%in case you have any feedback 😊%' + OR payload.issue.state_reason LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.html_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.diff_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.patch_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.issue_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.state LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.title LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.login LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.url LIKE '%in case you have any feedback 😊%' + OR 
payload.pull_request.user.html_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.following_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.user.type LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.body LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merge_commit_sha LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.commits_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.review_comments_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.review_comment_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.comments_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.statuses_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.label LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.ref LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.sha LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.login LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.avatar_url LIKE '%in case you have any feedback 😊%' + OR 
payload.pull_request.head.user.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.html_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.following_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.user.type LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.name LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.full_name LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.login LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.html_url LIKE '%in case you have any feedback 😊%' + OR 
payload.pull_request.head.repo.owner.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.following_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.owner.type LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.html_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.description LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.forks_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.keys_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.collaborators_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.teams_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.hooks_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.issue_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.assignees_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.branches_url LIKE '%in case you 
have any feedback 😊%' + OR payload.pull_request.head.repo.tags_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.blobs_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.git_tags_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.git_refs_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.trees_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.statuses_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.languages_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.stargazers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.contributors_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.subscribers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.subscription_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.commits_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.git_commits_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.comments_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.issue_comment_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.contents_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.compare_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.merges_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.archive_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.downloads_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.issues_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.pulls_url LIKE '%in case you have any 
feedback 😊%' + OR payload.pull_request.head.repo.milestones_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.notifications_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.labels_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.releases_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.deployments_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.git_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.ssh_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.clone_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.svn_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.homepage LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.language LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.mirror_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.visibility LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.head.repo.default_branch LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.label LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.ref LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.sha LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.login LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.html_url LIKE '%in case you have any feedback 😊%' + OR 
payload.pull_request.base.user.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.following_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.user.type LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.name LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.full_name LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.login LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.html_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.following_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.gists_url LIKE '%in case you have any feedback 😊%' + OR 
payload.pull_request.base.repo.owner.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.owner.type LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.html_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.description LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.forks_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.keys_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.collaborators_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.teams_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.hooks_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.issue_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.assignees_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.branches_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.tags_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.blobs_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.git_tags_url LIKE '%in case you have any feedback 😊%' + OR 
payload.pull_request.base.repo.git_refs_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.trees_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.statuses_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.languages_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.stargazers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.contributors_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.subscribers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.subscription_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.commits_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.git_commits_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.comments_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.issue_comment_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.contents_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.compare_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.merges_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.archive_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.downloads_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.issues_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.pulls_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.milestones_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.notifications_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.labels_url LIKE '%in case you have any feedback 😊%' + OR 
payload.pull_request.base.repo.releases_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.deployments_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.git_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.ssh_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.clone_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.svn_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.homepage LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.language LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.mirror_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.visibility LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.base.repo.default_branch LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.self.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.html.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.issue.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.comments.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.review_comments.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.review_comment.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.commits.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request._links.statuses.href LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.author_association LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.active_lock_reason LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.mergeable_state LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.login LIKE '%in case you have any feedback 
😊%' + OR payload.pull_request.merged_by.node_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.html_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.following_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.pull_request.merged_by.type LIKE '%in case you have any feedback 😊%' + OR payload.forkee.node_id LIKE '%in case you have any feedback 😊%' + OR payload.forkee.name LIKE '%in case you have any feedback 😊%' + OR payload.forkee.full_name LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.login LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.node_id LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.html_url LIKE '%in case you have any feedback 😊%' + OR 
payload.forkee.owner.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.following_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.events_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.owner.type LIKE '%in case you have any feedback 😊%' + OR payload.forkee.html_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.description LIKE '%in case you have any feedback 😊%' + OR payload.forkee.url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.forks_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.keys_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.collaborators_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.teams_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.hooks_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.issue_events_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.events_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.assignees_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.branches_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.tags_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.blobs_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.git_tags_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.git_refs_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.trees_url LIKE '%in case 
you have any feedback 😊%' + OR payload.forkee.statuses_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.languages_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.stargazers_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.contributors_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.subscribers_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.subscription_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.commits_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.git_commits_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.comments_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.issue_comment_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.contents_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.compare_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.merges_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.archive_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.downloads_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.issues_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.pulls_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.milestones_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.notifications_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.labels_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.releases_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.deployments_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.git_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.ssh_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.clone_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.svn_url LIKE '%in case you have any feedback 😊%' + OR payload.forkee.homepage LIKE '%in case 
you have any feedback 😊%' + OR payload.forkee.visibility LIKE '%in case you have any feedback 😊%' + OR payload.forkee.default_branch LIKE '%in case you have any feedback 😊%' + OR payload.release.url LIKE '%in case you have any feedback 😊%' + OR payload.release.assets_url LIKE '%in case you have any feedback 😊%' + OR payload.release.upload_url LIKE '%in case you have any feedback 😊%' + OR payload.release.html_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.login LIKE '%in case you have any feedback 😊%' + OR payload.release.author.node_id LIKE '%in case you have any feedback 😊%' + OR payload.release.author.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.release.author.url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.html_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.following_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.events_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.release.author.type LIKE '%in case you have any feedback 😊%' + OR payload.release.node_id LIKE '%in case you have any feedback 😊%' + OR payload.release.tag_name LIKE '%in case you have any feedback 😊%' + OR payload.release.target_commitish LIKE '%in case you have any feedback 😊%' + OR payload.release.name LIKE '%in 
case you have any feedback 😊%' + OR payload.release.tarball_url LIKE '%in case you have any feedback 😊%' + OR payload.release.zipball_url LIKE '%in case you have any feedback 😊%' + OR payload.release.body LIKE '%in case you have any feedback 😊%' + OR payload.release.short_description_html LIKE '%in case you have any feedback 😊%' + OR payload.release.discussion_url LIKE '%in case you have any feedback 😊%' + OR payload.member.login LIKE '%in case you have any feedback 😊%' + OR payload.member.node_id LIKE '%in case you have any feedback 😊%' + OR payload.member.avatar_url LIKE '%in case you have any feedback 😊%' + OR payload.member.gravatar_id LIKE '%in case you have any feedback 😊%' + OR payload.member.url LIKE '%in case you have any feedback 😊%' + OR payload.member.html_url LIKE '%in case you have any feedback 😊%' + OR payload.member.followers_url LIKE '%in case you have any feedback 😊%' + OR payload.member.following_url LIKE '%in case you have any feedback 😊%' + OR payload.member.gists_url LIKE '%in case you have any feedback 😊%' + OR payload.member.starred_url LIKE '%in case you have any feedback 😊%' + OR payload.member.subscriptions_url LIKE '%in case you have any feedback 😊%' + OR payload.member.organizations_url LIKE '%in case you have any feedback 😊%' + OR payload.member.repos_url LIKE '%in case you have any feedback 😊%' + OR payload.member.events_url LIKE '%in case you have any feedback 😊%' + OR payload.member.received_events_url LIKE '%in case you have any feedback 😊%' + OR payload.member.type LIKE '%in case you have any feedback 😊%' diff --git a/scripts/super-cmd-perf/queries/search.spq b/scripts/super-cmd-perf/queries/search.spq new file mode 100644 index 0000000000..b289be1760 --- /dev/null +++ b/scripts/super-cmd-perf/queries/search.spq @@ -0,0 +1,3 @@ +SELECT count() +FROM '__SOURCE__' +WHERE grep('in case you have any feedback 😊', payload.pull_request.body) diff --git a/scripts/super-cmd-perf/queries/search.sql 
b/scripts/super-cmd-perf/queries/search.sql new file mode 100644 index 0000000000..b4feba2741 --- /dev/null +++ b/scripts/super-cmd-perf/queries/search.sql @@ -0,0 +1,3 @@ +SELECT count() +FROM '__SOURCE__' +WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' diff --git a/scripts/super-cmd-perf/queries/union-clickhouse.sql b/scripts/super-cmd-perf/queries/union-clickhouse.sql new file mode 100644 index 0000000000..ecea189b37 --- /dev/null +++ b/scripts/super-cmd-perf/queries/union-clickhouse.sql @@ -0,0 +1,13 @@ +WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM '__SOURCE__' + UNION ALL + SELECT arrayJoin(payload.pull_request.assignees).login assignee + FROM '__SOURCE__' +) +SELECT assignee, count(*) count +FROM assignees +WHERE assignee IS NOT NULL +GROUP BY assignee +ORDER BY count DESC +LIMIT 5 diff --git a/scripts/super-cmd-perf/queries/union-datafusion.sql b/scripts/super-cmd-perf/queries/union-datafusion.sql new file mode 100644 index 0000000000..5346c3d211 --- /dev/null +++ b/scripts/super-cmd-perf/queries/union-datafusion.sql @@ -0,0 +1,15 @@ +WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM '__SOURCE__' + UNION ALL + SELECT object.login as assignee FROM ( + SELECT unnest(payload.pull_request.assignees) object + FROM '__SOURCE__' + ) +) +SELECT assignee, count() count +FROM assignees +WHERE assignee IS NOT NULL +GROUP BY assignee +ORDER BY count DESC +LIMIT 5 diff --git a/scripts/super-cmd-perf/queries/union.spq b/scripts/super-cmd-perf/queries/union.spq new file mode 100644 index 0000000000..bb53c5fe9f --- /dev/null +++ b/scripts/super-cmd-perf/queries/union.spq @@ -0,0 +1,6 @@ +FROM '__SOURCE__' +| UNNEST [...payload.pull_request.assignees, payload.pull_request.assignee] +| WHERE this IS NOT NULL +| AGGREGATE count() BY assignee:=login +| ORDER BY count DESC +| LIMIT 5 diff --git a/scripts/super-cmd-perf/queries/union.sql b/scripts/super-cmd-perf/queries/union.sql new file 
mode 100644
index 0000000000..163e9db4e1
--- /dev/null
+++ b/scripts/super-cmd-perf/queries/union.sql
@@ -0,0 +1,15 @@
+-- Benchmark "union" (DuckDB dialect): top 5 pull-request assignees, taken
+-- from the single assignee field plus the unnested assignees array.
+WITH assignees AS (
+  SELECT payload.pull_request.assignee.login AS assignee
+  FROM '__SOURCE__'
+  UNION ALL
+  SELECT unnest(payload.pull_request.assignees).login AS assignee
+  FROM '__SOURCE__'
+)
+SELECT assignee, count(*) AS count
+FROM assignees
+WHERE assignee IS NOT NULL
+GROUP BY assignee
+ORDER BY count DESC
+LIMIT 5
diff --git a/scripts/super-cmd-perf/run-queries.sh b/scripts/super-cmd-perf/run-queries.sh
new file mode 100755
index 0000000000..d715738b6d
--- /dev/null
+++ b/scripts/super-cmd-perf/run-queries.sh
@@ -0,0 +1,172 @@
+#!/bin/bash -xv
+#
+# Runs every benchmark query against every tool/format combination under
+# hyperfine and writes the timings as a Markdown table and a CSV file.
+#
+# Usage: run-queries.sh <results-dir>   (created beneath this script's directory)
+#
+# Environment:
+#   RUNNING_ON_AWS_EC2   if non-empty, data files are read from /mnt/
+#   DUCKDB_MEMORY_LIMIT  if non-empty, duckdb queries start with SET memory_limit
+set -euo pipefail
+pushd "$(cd "$(dirname "$0")" && pwd)"
+
+if [ "$#" -ne 1 ]; then
+  echo "Specify results directory string"
+  exit 1
+fi
+rundir="$(pwd)/$1"
+mkdir -p "$rundir"
+
+RUNNING_ON_AWS_EC2="${RUNNING_ON_AWS_EC2:-}"
+if [ -n "$RUNNING_ON_AWS_EC2" ]; then
+  storage="/mnt/"
+else
+  storage=""
+fi
+
+warmups=1
+runs=1
+report="$rundir/report_$(basename "$rundir").md"
+csv_report="$rundir/report_$(basename "$rundir").csv"
+
+# run_query <tool> <queryfile> <source>
+#
+# Substitutes <source> for the __SOURCE__ placeholder in queries/<queryfile>,
+# builds the command line for <tool> (super, duckdb, datafusion or clickhouse)
+# and times it with hyperfine. The command, the query text and everything
+# hyperfine prints are written to "$rundir/<tool>-<queryfile>-<source>.out".
+function run_query {
+  # Locals keep this function from overwriting the callers' loop variables.
+  local cmd="$1"
+  local queryfile="$2"
+  local source="$3"
+  local outputfile="$rundir/$cmd-$queryfile-$source.out"
+  local final_query
+  final_query=$(mktemp)
+  # Remove the temporary query even when set -e aborts the script mid-run.
+  trap "rm -f '$final_query'" EXIT
+
+  DUCKDB_MEMORY_LIMIT="${DUCKDB_MEMORY_LIMIT:-}"
+  if [ "$cmd" == "duckdb" ] && [ -n "$DUCKDB_MEMORY_LIMIT" ]; then
+    echo 'SET memory_limit = '\'"${DUCKDB_MEMORY_LIMIT}"\''; ' >> "$final_query"
+  fi
+
+  # "gha" is substituted as-is (duckdb opens ${storage}gha.db for it below);
+  # any other source is prefixed with $storage, with its slashes escaped for sed.
+  if [ "$source" == "gha" ]; then
+    sed -e "s/__SOURCE__/$source/" "queries/$queryfile" >> "$final_query"
+  else
+    sed -e "s/__SOURCE__/${storage//\//\\/}${source}/" "queries/$queryfile" >> "$final_query"
+  fi
+
+  # The whole command line is handed to hyperfine as one string, so the
+  # SUPER_VAM=1 assignment and the "<" redirection are part of that string.
+  if [ "$cmd" == "super" ]; then
+    if [ "$source" == "gha.parquet" ]; then
+      cmd="SUPER_VAM=1 super"
+    fi
+    cmd="$cmd -z -I $final_query"
+  elif [ "$cmd" == "duckdb" ]; then
+    if [ "$source" == "gha" ]; then
+      cmd="duckdb ${storage}gha.db"
+    fi
+    cmd="$cmd < $final_query"
+  elif [ "$cmd" == "datafusion" ]; then
+    cmd="datafusion-cli --file $final_query"
+  elif [ "$cmd" == "clickhouse" ]; then
+    cmd="clickhouse --queries-file $final_query"
+  fi
+
+  echo -e "About to execute\n================\n$cmd\n\nWith query\n==========" > "$outputfile"
+  cat "$final_query" >> "$outputfile"
+  echo >> "$outputfile"
+
+  { hyperfine \
+      --show-output \
+      --warmup "$warmups" \
+      --runs "$runs" \
+      --time-unit second \
+      "$cmd" ;
+  } \
+    >> "$outputfile" \
+    2>&1
+
+  rm -f "$final_query"
+}
+
+# Start both reports afresh so re-running into the same directory does not
+# append a second table to the Markdown file (the CSV is truncated likewise).
+echo "|**Tool**|**Format**|**search**|**search+**|**count**|**agg**|**union**|" > "$report"
+echo "|-|-|-|-|-|-|-|" >> "$report"
+echo "Tool,Format,search,search+,count,agg,union" > "$csv_report"
+
+# Each table cell is field 4 of the "Time" line hyperfine wrote to the .out
+# file. NOTE(review): that field holds the seconds only for the single-run
+# output format (runs=1) - re-check the awk field before raising runs.
+for source in gha.bsup gha.parquet
+do
+  echo -n "|\`super\`|\`${source/gha./}\`|" >> "$report"
+  echo -n "super,${source/gha./}" >> "$csv_report"
+  for queryfile in search.spq search+.spq count.sql agg.sql union.spq
+  do
+    if [ "$source" == "gha.parquet" ] && { [ "$queryfile" == "search.spq" ] || [ "$queryfile" == "search+.spq" ] || [ "$queryfile" == "union.spq" ]; }; then
+      echo -n "N/A|" >> "$report"
+      echo -n ",N/A" >> "$csv_report"
+      continue
+    fi
+    run_query super "$queryfile" "$source"
+    result=$(grep Time < "$rundir/super-$queryfile-$source.out" | awk '{ print $4 }')
+    echo -n "$result" >> "$report"
+    echo -n "|" >> "$report"
+    echo -n ",$result" >> "$csv_report"
+  done
+  echo >> "$report"
+  echo >> "$csv_report"
+done
+
+for source in gha gha.parquet
+do
+  duckdb_source=${source/gha\./}
+  duckdb_source=${duckdb_source/gha/db}
+  echo -n "|\`duckdb\`|\`$duckdb_source\`|" >> "$report"
+  echo -n "duckdb,$duckdb_source" >> "$csv_report"
+  for queryfile in search.sql search+.sql count.sql agg.sql union.sql
+  do
+    run_query duckdb "$queryfile" "$source"
+    result=$(grep Time < "$rundir/duckdb-$queryfile-$source.out" | awk '{ print $4 }')
+    echo -n "$result" >> "$report"
+    echo -n "|" >> "$report"
+    echo -n ",$result" >> "$csv_report"
+  done
+  echo >> "$report"
+  echo >> "$csv_report"
+done
+
+# datafusion and clickhouse only read the Parquet file, so its name is spelled
+# out here rather than reusing whatever $source the loop above left behind.
+echo -n "|\`datafusion\`|\`parquet\`|" >> "$report"
+echo -n "datafusion,parquet" >> "$csv_report"
+for queryfile in search.sql search+.sql count.sql agg.sql union-datafusion.sql
+do
+  run_query datafusion "$queryfile" gha.parquet
+  result=$(grep Time < "$rundir/datafusion-$queryfile-gha.parquet.out" | awk '{ print $4 }')
+  echo -n "$result" >> "$report"
+  echo -n "|" >> "$report"
+  echo -n ",$result" >> "$csv_report"
+done
+echo >> "$report"
+echo >> "$csv_report"
+
+echo -n "|\`clickhouse\`|\`parquet\`|" >> "$report"
+echo -n "clickhouse,parquet" >> "$csv_report"
+for queryfile in search.sql search+.sql count.sql agg.sql union-clickhouse.sql
+do
+  run_query clickhouse "$queryfile" gha.parquet
+  result=$(grep Time < "$rundir/clickhouse-$queryfile-gha.parquet.out" | awk '{ print $4 }')
+  echo -n "$result" >> "$report"
+  echo -n "|" >> "$report"
+  echo -n ",$result" >> "$csv_report"
+done
+echo >> "$report"
+echo >> "$csv_report"