From fd8adff7228bd425e8607278ee10cd96dde9c14b Mon Sep 17 00:00:00 2001 From: Steven McCanne Date: Sun, 10 Nov 2024 04:49:37 -0800 Subject: [PATCH 1/4] update docs on super command for SQL/OLAP audience --- docs/commands/search.sql | 487 ++++++++++++++++++ docs/commands/super.md | 1003 +++++++++++++++++++++++++++++--------- 2 files changed, 1271 insertions(+), 219 deletions(-) create mode 100644 docs/commands/search.sql diff --git a/docs/commands/search.sql b/docs/commands/search.sql new file mode 100644 index 0000000000..f7465cfa80 --- /dev/null +++ b/docs/commands/search.sql @@ -0,0 +1,487 @@ +SELECT count() FROM gha WHERE + id LIKE '%in case you have any feedback 😊%' +OR type LIKE '%in case you have any feedback 😊%' +OR actor.login LIKE '%in case you have any feedback 😊%' +OR actor.display_login LIKE '%in case you have any feedback 😊%' +OR actor.gravatar_id LIKE '%in case you have any feedback 😊%' +OR actor.url LIKE '%in case you have any feedback 😊%' +OR actor.avatar_url LIKE '%in case you have any feedback 😊%' +OR repo.name LIKE '%in case you have any feedback 😊%' +OR repo.url LIKE '%in case you have any feedback 😊%' +OR payload.ref LIKE '%in case you have any feedback 😊%' +OR payload.ref_type LIKE '%in case you have any feedback 😊%' +OR payload.pusher_type LIKE '%in case you have any feedback 😊%' +OR payload.head LIKE '%in case you have any feedback 😊%' +OR payload.before LIKE '%in case you have any feedback 😊%' +OR payload.master_branch LIKE '%in case you have any feedback 😊%' +OR payload.description LIKE '%in case you have any feedback 😊%' +OR payload.action LIKE '%in case you have any feedback 😊%' +OR org.login LIKE '%in case you have any feedback 😊%' +OR org.gravatar_id LIKE '%in case you have any feedback 😊%' +OR org.url LIKE '%in case you have any feedback 😊%' +OR org.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.review.node_id LIKE '%in case you have any feedback 😊%' +OR payload.review.user.login LIKE '%in case you have any feedback 
😊%' +OR payload.review.user.node_id LIKE '%in case you have any feedback 😊%' +OR payload.review.user.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.review.user.url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.html_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.following_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.events_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.review.user.type LIKE '%in case you have any feedback 😊%' +OR payload.review.body LIKE '%in case you have any feedback 😊%' +OR payload.review.commit_id LIKE '%in case you have any feedback 😊%' +OR payload.review.state LIKE '%in case you have any feedback 😊%' +OR payload.review.html_url LIKE '%in case you have any feedback 😊%' +OR payload.review.pull_request_url LIKE '%in case you have any feedback 😊%' +OR payload.review.author_association LIKE '%in case you have any feedback 😊%' +OR payload.review._links.html.href LIKE '%in case you have any feedback 😊%' +OR payload.review._links.pull_request.href LIKE '%in case you have any feedback 😊%' +OR payload.comment.url LIKE '%in case you have any feedback 😊%' +OR payload.comment.html_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.node_id LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.login LIKE '%in case you 
have any feedback 😊%' +OR payload.comment.user.node_id LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.html_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.following_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.events_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.user.type LIKE '%in case you have any feedback 😊%' +OR payload.comment.path LIKE '%in case you have any feedback 😊%' +OR payload.comment.commit_id LIKE '%in case you have any feedback 😊%' +OR payload.comment.author_association LIKE '%in case you have any feedback 😊%' +OR payload.comment.body LIKE '%in case you have any feedback 😊%' +OR payload.comment.reactions.url LIKE '%in case you have any feedback 😊%' +OR payload.comment.issue_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.diff_hunk LIKE '%in case you have any feedback 😊%' +OR payload.comment.original_commit_id LIKE '%in case you have any feedback 😊%' +OR payload.comment.pull_request_url LIKE '%in case you have any feedback 😊%' +OR payload.comment.start_side LIKE '%in case you have any feedback 😊%' +OR payload.comment.side LIKE '%in case you have any feedback 😊%' +OR 
payload.issue.url LIKE '%in case you have any feedback 😊%' +OR payload.issue.repository_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.labels_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.comments_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.events_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.html_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.node_id LIKE '%in case you have any feedback 😊%' +OR payload.issue.title LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.login LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.node_id LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.html_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.following_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.events_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.user.type LIKE '%in case you have any feedback 😊%' +OR payload.issue.state LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.login LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.node_id LIKE '%in case you have any feedback 😊%' +OR 
payload.issue.assignee.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.html_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.following_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.events_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.assignee.type LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.html_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.labels_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.node_id LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.title LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.description LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.login LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.node_id LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.url LIKE '%in case you have any 
feedback 😊%' +OR payload.issue.milestone.creator.html_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.following_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.events_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.creator.type LIKE '%in case you have any feedback 😊%' +OR payload.issue.milestone.state LIKE '%in case you have any feedback 😊%' +OR payload.issue.author_association LIKE '%in case you have any feedback 😊%' +OR payload.issue.active_lock_reason LIKE '%in case you have any feedback 😊%' +OR payload.issue.body LIKE '%in case you have any feedback 😊%' +OR payload.issue.reactions.url LIKE '%in case you have any feedback 😊%' +OR payload.issue.timeline_url LIKE '%in case you have any feedback 😊%' +OR payload.issue.state_reason LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.diff_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.patch_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.issue_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.state LIKE '%in 
case you have any feedback 😊%' +OR payload.pull_request.title LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.login LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.following_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.user.type LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.body LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merge_commit_sha LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.commits_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.review_comments_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.review_comment_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.comments_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.statuses_url LIKE '%in case you have any feedback 😊%' +OR 
payload.pull_request.head.label LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.ref LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.sha LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.login LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.following_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.user.type LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.name LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.full_name LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.login LIKE '%in case you have any feedback 😊%' +OR 
payload.pull_request.head.repo.owner.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.following_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.owner.type LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.description LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.forks_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.keys_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.collaborators_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.teams_url LIKE '%in case you have any 
feedback 😊%' +OR payload.pull_request.head.repo.hooks_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.issue_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.assignees_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.branches_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.tags_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.blobs_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.git_tags_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.git_refs_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.trees_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.statuses_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.languages_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.stargazers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.contributors_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.subscribers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.subscription_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.commits_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.git_commits_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.comments_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.issue_comment_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.contents_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.compare_url LIKE '%in case you have any feedback 😊%' +OR 
payload.pull_request.head.repo.merges_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.archive_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.downloads_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.issues_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.pulls_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.milestones_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.notifications_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.labels_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.releases_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.deployments_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.git_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.ssh_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.clone_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.svn_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.homepage LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.language LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.mirror_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.visibility LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.head.repo.default_branch LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.label LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.ref LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.sha LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.login LIKE '%in case you have any feedback 😊%' +OR 
payload.pull_request.base.user.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.following_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.user.type LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.name LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.full_name LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.login LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.url LIKE '%in case you have 
any feedback 😊%' +OR payload.pull_request.base.repo.owner.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.following_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.owner.type LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.description LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.forks_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.keys_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.collaborators_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.teams_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.hooks_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.issue_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.assignees_url LIKE '%in case you 
have any feedback 😊%' +OR payload.pull_request.base.repo.branches_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.tags_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.blobs_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.git_tags_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.git_refs_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.trees_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.statuses_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.languages_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.stargazers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.contributors_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.subscribers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.subscription_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.commits_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.git_commits_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.comments_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.issue_comment_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.contents_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.compare_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.merges_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.archive_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.downloads_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.issues_url LIKE '%in case you have any feedback 😊%' +OR 
payload.pull_request.base.repo.pulls_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.milestones_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.notifications_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.labels_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.releases_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.deployments_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.git_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.ssh_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.clone_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.svn_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.homepage LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.language LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.mirror_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.visibility LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.base.repo.default_branch LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.self.href LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.html.href LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.issue.href LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.comments.href LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.review_comments.href LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.review_comment.href LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.commits.href LIKE '%in case you have any feedback 😊%' +OR payload.pull_request._links.statuses.href LIKE '%in case you have any 
feedback 😊%' +OR payload.pull_request.author_association LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.active_lock_reason LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.mergeable_state LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.login LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.node_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.html_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.following_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.pull_request.merged_by.type LIKE '%in case you have any feedback 😊%' +OR payload.forkee.node_id LIKE '%in case you have any feedback 😊%' +OR payload.forkee.name LIKE '%in case you have any feedback 😊%' +OR payload.forkee.full_name LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.login LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.node_id LIKE '%in case you have any 
feedback 😊%' +OR payload.forkee.owner.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.html_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.following_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.events_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.owner.type LIKE '%in case you have any feedback 😊%' +OR payload.forkee.html_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.description LIKE '%in case you have any feedback 😊%' +OR payload.forkee.url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.forks_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.keys_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.collaborators_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.teams_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.hooks_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.issue_events_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.events_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.assignees_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.branches_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.tags_url LIKE '%in 
case you have any feedback 😊%' +OR payload.forkee.blobs_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.git_tags_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.git_refs_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.trees_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.statuses_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.languages_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.stargazers_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.contributors_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.subscribers_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.subscription_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.commits_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.git_commits_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.comments_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.issue_comment_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.contents_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.compare_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.merges_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.archive_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.downloads_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.issues_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.pulls_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.milestones_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.notifications_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.labels_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.releases_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.deployments_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.git_url LIKE '%in case you have any 
feedback 😊%' +OR payload.forkee.ssh_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.clone_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.svn_url LIKE '%in case you have any feedback 😊%' +OR payload.forkee.homepage LIKE '%in case you have any feedback 😊%' +OR payload.forkee.visibility LIKE '%in case you have any feedback 😊%' +OR payload.forkee.default_branch LIKE '%in case you have any feedback 😊%' +OR payload.release.url LIKE '%in case you have any feedback 😊%' +OR payload.release.assets_url LIKE '%in case you have any feedback 😊%' +OR payload.release.upload_url LIKE '%in case you have any feedback 😊%' +OR payload.release.html_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.login LIKE '%in case you have any feedback 😊%' +OR payload.release.author.node_id LIKE '%in case you have any feedback 😊%' +OR payload.release.author.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.release.author.url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.html_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.following_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.events_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.release.author.type LIKE '%in case you have any feedback 😊%' +OR 
payload.release.node_id LIKE '%in case you have any feedback 😊%' +OR payload.release.tag_name LIKE '%in case you have any feedback 😊%' +OR payload.release.target_commitish LIKE '%in case you have any feedback 😊%' +OR payload.release.name LIKE '%in case you have any feedback 😊%' +OR payload.release.tarball_url LIKE '%in case you have any feedback 😊%' +OR payload.release.zipball_url LIKE '%in case you have any feedback 😊%' +OR payload.release.body LIKE '%in case you have any feedback 😊%' +OR payload.release.short_description_html LIKE '%in case you have any feedback 😊%' +OR payload.release.discussion_url LIKE '%in case you have any feedback 😊%' +OR payload.member.login LIKE '%in case you have any feedback 😊%' +OR payload.member.node_id LIKE '%in case you have any feedback 😊%' +OR payload.member.avatar_url LIKE '%in case you have any feedback 😊%' +OR payload.member.gravatar_id LIKE '%in case you have any feedback 😊%' +OR payload.member.url LIKE '%in case you have any feedback 😊%' +OR payload.member.html_url LIKE '%in case you have any feedback 😊%' +OR payload.member.followers_url LIKE '%in case you have any feedback 😊%' +OR payload.member.following_url LIKE '%in case you have any feedback 😊%' +OR payload.member.gists_url LIKE '%in case you have any feedback 😊%' +OR payload.member.starred_url LIKE '%in case you have any feedback 😊%' +OR payload.member.subscriptions_url LIKE '%in case you have any feedback 😊%' +OR payload.member.organizations_url LIKE '%in case you have any feedback 😊%' +OR payload.member.repos_url LIKE '%in case you have any feedback 😊%' +OR payload.member.events_url LIKE '%in case you have any feedback 😊%' +OR payload.member.received_events_url LIKE '%in case you have any feedback 😊%' +OR payload.member.type LIKE '%in case you have any feedback 😊%' diff --git a/docs/commands/super.md b/docs/commands/super.md index 1faec39c6a..7329797807 100644 --- a/docs/commands/super.md +++ b/docs/commands/super.md @@ -7,13 +7,10 @@ sidebar_label: super > **TL;DR** 
`super` is a command-line tool that uses [SuperSQL](../language/README.md) > to query a variety of data formats in files, over HTTP, or in [S3](../integrations/amazon-s3.md) -> storage. It is particularly fast when operating on data in binary formats such as -> [Super Binary](../formats/bsup.md), [Super Columnar](../formats/csup.md), and -> [Parquet](https://github.com/apache/parquet-format). -> -> The `super` design philosophy blends the command-line, embedded database -> approach of SQLite and DuckDB with the query/search-tool approach -> of `jq`, `awk`, and `grep`. +> storage. Best performance is achieved when operating on data in binary formats such as +> [Super Binary](../formats/bsup.md), [Super Columnar](../formats/csup.md), +> [Parquet](https://github.com/apache/parquet-format), or +> [Arrow](https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format). ## Usage @@ -22,54 +19,64 @@ super [ options ] [ -c query ] input [ input ... ] ``` `super` is a command-line tool for processing data in diverse input -formats, powering data wrangling, search, analytics, and extensive transformations -using the [SuperSQL language](../language/README.md). A SuperSQL query may be extended with -[pipe syntax](https://research.google/pubs/sql-has-problems-we-can-fix-them-pipe-syntax-in-sql/) -to apply Boolean logic or keyword search to filter the input, transform, and/or analyze -the filtered stream. Output is written to one or more files or to -standard output. - -Each `input` argument must be a file path, an HTTP or HTTPS URL, -an S3 URL, or standard input specified with `-`. - -For built-in command help and a listing of all available options, -simply run `super` with no arguments. - -`super` supports a number of [input](#input-formats) and [output](#output-formats) formats, but [Super Binary](../formats/bsup.md) -tends to be the most space-efficient and most performant. 
Super Binary has efficiency similar to -[Avro](https://avro.apache.org) -and [Protocol Buffers](https://developers.google.com/protocol-buffers) -but its comprehensive [type system](../formats/zed.md) obviates -the need for schema specification or registries. -Also, the [Super JSON](../formats/jsup.md) format is human-readable and entirely one-to-one with Super Binary -so there is no need to represent non-readable formats like Avro or Protocol Buffers -in a clunky JSON encapsulated form. - -`super` typically operates on Super Binary-encoded data and when you want to inspect -human-readable bits of output, you merely format it as Super JSON, which is the -default format when output is directed to the terminal. Super Binary is the default -when redirecting to a non-terminal output like a file or pipe. +formats, providing data wrangling, search, analytics, and extensive transformations +using the [SuperSQL](../language/README.md) dialect of SQL. Any SQL query expression +may be extended with [pipe syntax](https://research.google/pubs/sql-has-problems-we-can-fix-them-pipe-syntax-in-sql/) +to filter, transform, and/or analyze input data. +Super's SQL pipes dialect is extensive, so much so that it can resemble +a log-search experience despite its SQL foundation. -When run with input arguments, each input's format is [automatically inferred](#auto-detection) -and each input is scanned -in the order appearing on the command line forming the input stream. +The `super` command works with data from ephemeral sources like files and URLs. +If you want to persist your data into a data lake for persistent storage, +check out the [`super db`](zed.md) set of commands. By invoking the `-c` option, a query expressed in the [SuperSQL language](../language/README.md) may be specified and applied to the input stream. -If no query is specified, the inputs are scanned without modification -and output in the desired format as [described below](#input-formats). 
+ +When multiple input files are specified, they are processed in the order given as +if the data were provided by a single, concatenated "from" clause. + +If no query is specified with `-c`, the inputs are scanned without modification +and output in the desired format as [described below](#input-formats), +providing a convenient means to convert files from one format to another, e.g., +``` +super -f arrows file1.json file2.parquet file3.csv > file-combined.arrows +``` +When `super` is run with a query that has no "from" operator and no input arguments, +the SuperSQL query is fed a single `null` value analagous to SQL's default +input of a single empty row of an unnamed table. +This provides a convenient means to explore examples or run in a +"calculator mode", e.g., ```mdtest-command super -z -c '1+1' ``` @@ -77,10 +84,40 @@ emits ```mdtest-output 2 ``` -Note here that the query `1+1` [implies](../language/pipeline-model.md#implied-operators) -`yield 1+1`. +Note that SuperSQL's has syntactic shortcuts for interactive data exploration and +an expression that stands alone is a shortcut for `select value`, e.g., the query text +``` +1+1 +``` +is equivalent to +``` +select value 1+1 +``` +To learn more about shortcuts, refer to the SuperSQL +[documenation on shortcuts](../language/pipeline-model.md#implied-operators). + +For built-in command help and a listing of all available options, +simply run `super` with no arguments. + +## Data Formats + +`super` supports a number of [input](#input-formats) and [output](#output-formats) formats, but the super formats +([Super Binary](../formats/bsup.md), +[Super Columnar](../formats/csup.md), +and [Super JSON](../formats/jsup.md)) tend to the most versatile and +easy to work with. + +`super` typically operates on binary-encoded data and when you want to inspect +human-readable bits of output, you merely format it as Super JSON, which is the +default format when output is directed to the terminal. 
Super Binary is the default +when redirecting to a non-terminal output like a file or pipe. + +Unless the `-i` option specifies a specific input format, +each input's format is [automatically inferred](#auto-detection) +and each input is scanned +in the order appearing on the command line forming the input stream. -## Input Formats +### Input Formats `super` currently supports the following input formats: @@ -138,8 +175,8 @@ would produce this output in the default Super JSON format ### JSON Auto-detection: Super vs. Plain -Since [Super JSON](../formats/jsup.md) is a superset of plain JSON, `super` must be careful in whether it -interprets input as either format. While you can always clarify your intent +Since [Super JSON](../formats/jsup.md) is a superset of plain JSON, `super` must be careful how it distinguishes the two cases when performing auto-inference. +While you can always clarify your intent with the `-i jsup` or `-i json`, `super` attempts to "just do the right thing" when you run it with Super JSON vs. plain JSON. @@ -164,7 +201,7 @@ as an outer object or as a value nested somewhere within a JSON array. This heuristic almost always works in practice because Super JSON records typically omit quotes around field names. -## Output Formats +### Output Formats `super` currently supports the following output formats: @@ -270,8 +307,8 @@ or register schemas or "protos" with the downstream entities. 
In particular, Super Binary data can simply be concatenated together, e.g., ```mdtest-command -super -f bsup -c 'yield 1,[1,2,3]' > a.bsup -super -f bsup -c 'yield {s:"hello"},{s:"world"}' > b.bsup +super -f bsup -c 'select value 1, [1,2,3]' > a.bsup +super -f bsup -c 'select value {s:"hello"}, {s:"world"}' > b.bsup cat a.bsup b.bsup | super -z - ``` produces @@ -283,7 +320,7 @@ produces ``` And while this Super JSON output is human readable, the Super Binary files are binary, e.g., ```mdtest-command -super -f bsup -c 'yield 1,[1,2,3]' > a.bsup +super -f bsup -c 'select value 1,[ 1,2,3]' > a.bsup hexdump -C a.bsup ``` produces @@ -545,7 +582,7 @@ have many examples, but here are a few more simple `super` use cases. _Hello, world_ ```mdtest-command -echo '"hello, world"' | super -z -c 'yield this' - +super -z -c "select value 'hello, world'" ``` produces this Super JSON output ```mdtest-output @@ -554,7 +591,7 @@ produces this Super JSON output _Some values of available [data types](../language/data-types.md)_ ```mdtest-command -echo '1 1.5 [1,"foo"] |["apple","banana"]|' | super -z -c 'yield this' - +echo '1 1.5 [1,"foo"] |["apple","banana"]|' | super -z - ``` produces ```mdtest-output @@ -565,7 +602,7 @@ produces ``` _The types of various data_ ```mdtest-command -echo '1 1.5 [1,"foo"] |["apple","banana"]|' | super -z -c 'yield typeof(this)' - +echo '1 1.5 [1,"foo"] |["apple","banana"]|' | super -z -c 'select value typeof(this)' - ``` produces ```mdtest-output @@ -616,200 +653,728 @@ produces ## Performance -Your mileage may vary, but many new users of `super` are surprised by its speed -compared to tools like `jq`, `grep`, `awk`, or `sqlite` especially when running -`super` over files in the Super Binary format. - -### Fast Pattern Matching - -One important technique that helps `super` run fast is to take advantage of queries -that involve fine-grained searches. 
+You might think that the overhead involved in managing super-structured types +and the generality of heterogeneous data would confound the performance of +the `super` command, but it turns out that `super` can hold its own when +compared to other analytics systems. -When a query begins with a logical expression containing either a search -or a predicate match with a constant value, and presuming the input data format -is Super Binary, then the runtime optimizes the query by performing an efficient, -byte-oriented "pre-search" of the values required in the predicate. This pre-search -scans the bytes that comprise a large buffer of values and looks for these values -and, if they are not present, the entire buffer is discarded knowing no individual -value in that buffer could match because the required serialized -values were not present in the buffer. +To illustrate comparative performance, we'll present some informal performance +measurements among `super`, +[`DuckDB`](https://duckdb.org/), +[`ClickHouse`](https://clickhouse.com/), and +[`DataFusion`](https://datafusion.apache.org/). -For example, if the query is -``` -"http error" and ipsrc==10.0.0.1 | count() -``` -then the pre-search would look for the string "http error" and the encoding -of the IP address 10.0.0.1 and unless both those values are present, then the -buffer is discarded. +We'll use the Parquet format to compare apples to apples +and also report results for the custom columnar database format of DuckDB +and the Super Binary format used by `super`. +We tried loading our test data into a ClickHouse table using its +[new experimental JSON type](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse) +but those attempts failed with "too many open files". -Moreover, Super Binary data is compressed and arranged into frames that can be decompressed -and processed in parallel. 
This allows the decompression and pre-search to -run in parallel very efficiently across a large number of threads. When searching -for sparse results, many frames are discarded without their uncompressed bytes -having to be processed any further. +As of this writing in November 2024, we're using the latest version 1.1.3 of `duckdb`. +version 24.11.1.1393 of `clickhouse`, and v43.0.0 of `datafusion-cli`. -### Efficient JSON Processing +### The Test Data -While processing data in the Super Binary format is far more efficient than JSON, -there is substantial JSON data in the world and it is important for JSON -input to perform well. +These tests are based on the data and exemplary queries +published by the DuckDB team on their blog +[Shredding Deeply Nested JSON, One Vector at a Time](https://duckdb.org/2023/03/03/json.html). We'll follow their script starting at the +[GitHub Archive Examples](https://duckdb.org/2023/03/03/json.html#github-archive-examples). -This proved a challenge as `super` is written in [Go](https://go.dev/) and Go's JSON package -is not particularly performant. To this end, `super` has its own lean and simple -[JSON tokenizer](https://pkg.go.dev/github.com/brimdata/super/pkg/jsonlexer), -which performs quite well, -and is -[integrated tightly](https://github.com/brimdata/super/blob/main/zio/jsonio/reader.go) -with SuperDB's internal data representation. -Moreover, like `jq`, -`super`'s JSON parser does not require objects to be newline delimited and can -incrementally parse the input to minimize memory overhead and improve -processor cache performance. - -The net effect is a JSON parser that is typically a bit faster than the -native C implementation in `jq`. - -### Performance Comparisons - -To provide a rough sense of the performance tradeoffs between `super` and -other tooling, this section provides results of a few simple speed tests. 
- -#### Test Data +If you want to reproduce these results for yourself, +you can fetch the 2.2GB of gzipped JSON data: +``` +wget https://data.gharchive.org/2023-02-08-0.json.gz +wget https://data.gharchive.org/2023-02-08-1.json.gz +... +wget https://data.gharchive.org/2023-02-08-23.json.gz +``` +We downloadied these files into a directory called `gharchive_gz` +and created a duckdb database file called `gha.db` and a table called `gha` +using this command: +``` +duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', union_by_name=true)" +``` +To create a relational table from the input JSON, we utilized DuckDB's +`union_by_name` parameter to fuse all of the different shapes of JSON encountered +into a single monolithic schema. -These tests are easy to reproduce. The input data comes from a -[repository of sample security log data](https://github.com/brimdata/zed-sample-data), -where we used a semi-structured Zeek "conn" log from the `zeek-default` directory. +We then created a Parquet file called `gha.parquet` with this command: +``` +duckdb gha.db -c "COPY (from gha) TO 'gha.parquet'" +``` +To create a super-structed file for the `super` command, there is no need to +fuse the data into a single schema (though `super` can still work with the fused +schema in the Parquet file), and we simply ran this command to create a Super Binary +file: +``` +super gharchive_gz/*.json.gz > gha.bsup +``` +This code path in `super` is not multi-threaded so not particularly performant but, +on our test machine, it takes about the same time as the `duckdb` method of creating +a schema-fused table. 
-It is easy to convert the Zeek logs to a local Super Binary file using -`super`'s built-in [`get` operator](../language/operators/get.md): +Here are the resulting file sizes: ``` -super -o conn.bsup -c 'get https://raw.githubusercontent.com/brimdata/zed-sample-data/main/zeek-default/conn.log.gz' +% du -h gha.db gha.parquet gha.bsup gharchive_gz +9.3G gha.db +4.6G gha.parquet +2.8G gha.bsup +2.2G gharchive_gz ``` -This creates a new file `conn.bsup` from the Zeek log file fetched from GitHub. -Note that this data is a gzip'd file in the Zeek format and `super`'s auto-detector -figures out both that it is gzip'd and that the uncompressed format is Zeek. -There's no need to specify flags for this. +### The Test Queries + +The test queries involve these patterns: +* simple search (single and multicolumn) +* count-where aggregation +* count by field aggregation +* rank over union of disparate field types -Next, a JSON file can be converted from Super Binary using: +We will call these tests `search`, `search+`, `count`, `agg`, and `union`, respectively + +#### Search + +For the search test, we'll search for the string pattern +``` + "in case you have any feedback 😊" +``` +in the field `payload.pull_request.body` +and we'll just count the number of matches found. +The number of matches is small (3) so the query performance is dominated +by the search. + +The SQL for this query is +```sql +SELECT count() +FROM 'gha.parquet' -- or gha +WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' +``` +SuperSQL has a function called `grep` that is similar to the SQL `LIKE` clause but +can operate over specified fields or default to all the string fields in any value. 
+The SuperSQL query is +```sql +SELECT count() +FROM 'gha.bsup' +WHERE grep('in case you have any feedback 😊', payload.pull_request.body) ``` -super -f json conn.bsup > conn.json + +#### Search+ + +For search across multiple columns, SQL doesn't have a `grep` function so +we must enumerate all the fields of such a query. The SQL for a string search +over our GitHub Archive dataset involves the following fields: +```sql +SELECT count() FROM gha +WHERE id LIKE '%in case you have any feedback 😊%' + OR type LIKE '%in case you have any feedback 😊%' + OR actor.login LIKE '%in case you have any feedback 😊%' + OR actor.display_login LIKE '%in case you have any feedback 😊%' + ... + OR payload.member.type LIKE '%in case you have any feedback 😊%' ``` -Note here that we lose information in this conversion because the rich data types -of the [super data model](../formats/zed.md) (that were [translated from the Zeek format](../integrations/zeek/data-type-compatibility.md)) are lost. +There are 486 such fields. You can review the entire query in +[docs/commands/search.sql](search.sql). + +#### Count -We'll also make a SQLite database in the file `conn.db` as the table named `conn`. -One easy way to do this is to install -[sqlite-utils](https://sqlite-utils.datasette.io/en/stable/) -and run +In the `count` test, we filter the input with a WHERE clause and count the results. +We chose a random GitHub user name for the filter. +This query has the form: ``` -sqlite-utils insert conn.db conn conn.json --nl +SELECT count() +FROM 'gha.parquet' -- or gha or 'gha.bsup' +WHERE actor.login='johnbieren'" ``` -(If you need a cup of coffee, a good time to get it would be when -loading the JSON into SQLite.) -#### File Sizes +#### Agg -Note the resulting file sizes: +In the `agg` test, we filter the input and count the results grouped by the field `type` +as in the DuckDB blog. 
+This query has the form: ``` -% du -h conn.json conn.db conn.bsup -416M conn.json -192M conn.db - 38M conn.bsup +SELECT count(),type +FROM 'gha.parquet' -- or 'gha' or 'gha.bsup' +WHERE repo.name='duckdb/duckdb' +GROUP BY type ``` -Much of the performance of Super Binary derives from an efficient, parallelizable -structure where frames of data are compressed -(currently with [LZ4](http://lz4.github.io/lz4/) though the -specification supports multiple algorithms) and the sequence of values -can be processed with only partial deserialization. -That said, there are quite a few more opportunities to further improve -the performance of `super` and the SuperDB system and we have a number of projects -forthcoming on this front. +#### Union + +The `union` test is straight out of the DuckDB blog at the end of +[this section](https://duckdb.org/2023/03/03/json.html#handling-inconsistent-json-schemas). +This query computes the GitHub users that were assigned as a PR reviewer the most often +and returns the top 5 such users. +Because the assignees can appear in either a list of strings +or within a single string field, the relational model requires that two different +subqueries run for the two cases and the result unioned together; then, +this intermediary table can be counted using the unnested +assignee as the group-by key. 
+This query is: +```sql +WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM 'gha.parquet' + UNION ALL + SELECT unnest(payload.pull_request.assignees).login assignee + FROM 'gha.parquet' +) +SELECT assignee, count(*) count +FROM assignees +WHERE assignee IS NOT NULL +GROUP BY assignee +ORDER BY count DESC +LIMIT 5 +``` +For DataFusion, we needed to rewrite this SELECT +```sql +SELECT unnest(payload.pull_request.assignees).login +FROM 'gha.parquet' +``` +as +```sql +SELECT rec.login as assignee FROM ( + SELECT unnest(payload.pull_request.assignees) rec + FROM 'gha.parquet' +) +``` +and for ClickHouse, we had to use `arrayJoin` instead of `unnest`. + +SuperSQL's data model does not require these sorts of gymnastics as +everything does not have to be jammed into a table. Instead, we can use the +`UNNEST` pipe operator combined with the spread operator applied to the array of +string fields to easily produce a stream of string values representing the +assignees. 
Then we simply aggregate the assignee stream: +``` +FROM 'gha.bsup' +| UNNEST [...payload.pull_request.assignees, payload.pull_request.assignee] +| WHERE this IS NOT NULL +| AGGREGATE count() BY assignee:=login +| ORDER BY count DESC +| LIMIT 5 +``` + +### The Test Results + +The following table summarizes the results of each test as a column and +each tool as a row with the speed-up factor shown in parentheses: + +| tool | format | search | search+ | count | agg | union | +|--------------|---------------|---------------|---------------|----|------|-------| +| `super` | `bsup` | 3.2 (2.6X) | 6.7 (3.6X) | 3.2 (0.04X) | 3.1 (0.04X) | 3.8 (117X) | +| `super` | `parquet` | note 1 | note 1 | 0.18 (0.7X) | 0.27 (0.4X) | note 2 | +| `duckdb` | `db` | 8.2 | 24 | 0.13 | 0.12 | 446 | +| `duckdb` | `parquet` | 8.4 (1) | 23 (1X) | 0.26 (0.5X) | 0.21 (0.6X) | 419 (1.1X) | +| `datafusion` | `parquet` | 9.1 (0.9X) | 18 (1.3X) | 0.24 (0.5X) | 0.24 (0.5X) | 40 (11x) | +| `clickhouse` | `parquet` | 56 (0.1X) | 463 (0.1X) | 1 (0.1X) | 0.91 (0.1X) | 66 (7X) | + +_Note 1: the `super` vectorized runtime does not yet support `grep`_ + +_Note 2: the `super` vectorized runtime does not yet support array expressions_ -#### Tests +Since DuckDB with its native format is overall the best performing, +we used it as the baseline for all of the speedup factors. -We ran three styles of tests on a Mac quad-core 2.3GHz i7: -* `count` - compute the number of values present -* `search` - find a value in a field -* `agg` - sum a field grouped by another field +To summarize, +`super` with Super Binary is substantially faster than the relational systems for +the search use cases and performs on par with the others for traditional OLAP queries, +except for the union query, where the super-structured data model trounces the relational +model (by over 100X!) for stiching together disparate data types for analysis in an aggregation. 
-Each test was run for `jq`, `super` on JSON, `sqlite3`, and `super` on Super Binary. +## Appendix 1: Preparing the Test Data -We used the Bash `time` command to measure elapsed time. +For our tests, We diverged a bit from the methodology in the DuckDB blog and wanted +to put all the JSON data in a single table. It wasn't obvious how to go about this +and this section documents the difficulties we encountered trying to do so. -The command lines for the `count` test were: +First, we simply tried this: ``` -jq -s length conn.json -sqlite3 conn.db 'select count(*) from conn' -super -c 'count()' conn.bsup -super -c 'count()' conn.json +duckdb gha.db -c "CREATE TABLE gha AS FROM 'gharchive_gz/*.json.gz'" ``` -The command lines for the `search` test were: +which fails with ``` -jq 'select(.id.orig_h=="10.47.23.5")' conn.json -sqlite3 conn.db 'select * from conn where json_extract(id, "$.orig_h")=="10.47.23.5"' -super -c 'id.orig_h==10.47.23.5' conn.bsup -super -c 'id.orig_h==10.47.23.5' conn.json +Invalid Input Error: JSON transform error in file "gharchive_gz/2023-02-08-10.json.gz", in line 4903: Object {"url":"https://api.github.com/repos/aws/aws-sam-c... has unknown key "reactions" +Try increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or 'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when reading multiple files with a different structure. ``` -Here, we look for an IP address (10.47.23.5) in a specific -field `id.orig_h` in the semi-structured data. Note when using Super Binary, -the IP is a native type whereas for `jq` and SQLite it is a string. -Note that `sqlite` must use its `json_extract` function since nested JSON objects -are stored as minified JSON text. +Clearly the schema inference algorithm relies upon sampling and the sample doesn't +cover enough data to capture all of its variations. 
-The command lines for the `agg` test were: +Okay, maybe there is a reason the blog first explores the structure of +the data to specify `columns` arguments to `read_json` as suggested by the error +message above. To this end, you can run this query: ``` -jq -n -f agg.jq conn.json -sqlite3 conn.db 'select sum(orig_bytes),json_extract(id, "$.orig_h") as orig_h from conn group by orig_h' -super -c "sum(orig_bytes) by id.orig_h" conn.bsup -super -c "sum(orig_bytes) by id.orig_h" conn.json +SELECT json_group_structure(json) +FROM ( + SELECT * + FROM read_ndjson_objects('gharchive_gz/*.json.gz') + LIMIT 2048 +); ``` -where the `agg.jq` script is: +Unfortunately, if you use the resulting structure to create the `columns` argument +then `duckdb` fails also because the first 2048 records don't have enough coverage. +So let's try removing the `LIMIT` clause: ``` -def adder(stream): - reduce stream as $s ({}; .[$s.key] += $s.val); -adder(inputs | {key:.id.orig_h,val:.orig_bytes}) -| to_entries[] -| {orig_h: (.key), sum: .value} +SELECT json_group_structure(json) +FROM ( + SELECT * + FROM read_ndjson_objects('gharchive_gz/*.json.gz') +); ``` +Hmm, now `duckdb` runs out of memory. -#### Results +We then thought we'd see if the sampling algorithm of `read_json` is more efficient, +so we ran tried this command with successively larger sample sizes: +``` +duckdb scratch -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', sample_size=1000000)" +``` +even with a million rows as the sample, `duckdb` fails with +``` +Invalid Input Error: JSON transform error in file "gharchive_gz/2023-02-08-14.json.gz", in line 49745: Object {"issues":"write","metadata":"read","pull_requests... has unknown key "repository_hooks" +Try increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or 'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when reading multiple files with a different structure. 
+``` +Ok, there 4434953 JSON objects in the input so let's try this: +``` +duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', sample_size=4434953)" +``` +and again `duckdb` runs out of memory. -The following table summarizes the results of each test as a column and -each tool as a row with the speed-up factor (relative to `jq`) -shown in parentheses: - -| | `count` | `search` | `agg` | -|------|---------------|---------------|---------------| -| `jq` | 11,540ms (1X) | 10,730ms (1X) | 20,175ms (1X) | -| `super-json` | 7,150ms (1.6X) | 7,230ms (1.5X) | 7,390ms (2.7X) | -| `sqlite` | 100ms (115X) | 620ms (17X) | 1,475ms (14X) | -| `super-bsup` | 110ms (105X) | 135ms (80X) | 475ms (42X) | - -To summarize, `super` with Super Binary is consistently fastest though `sqlite` -was a bit faster counting rows. - -In particular, `super` is substantially faster (40-100X) than `jq` with the efficient -Super Binary format but more modestly faster (50-170%) when processing the bulky JSON input. -This is expected because parsing JSON becomes the bottleneck. - -While SQLite is much faster than `jq`, it is not as fast as `super`. The primary -reason for this is that SQLite stores its semi-structured columns as minified JSON text, -so it must scan and parse the JSON when executing the _where_ clause above -as well as the aggregated fields. - -Also, note that the inferior performance of `sqlite` is in areas where databases -perform extraordinarily well if you do the work to -(1) transform semi-structured columns to relational columns by flattening -nested JSON objects (which are not indexable by `sqlite`) and -(2) configuring database indexes. - -In fact, if you implement these changes, `sqlite` performs better than `super` on these tests. - -However, the benefit of SuperDB is that no flattening is required. And unlike `sqlite`, -`super` is not intended to be a database. 
That said, there is no reason why database -performance techniques cannot be applied to the super data model and this is precisely what the -open-source SuperDB project intends to do. - -Stay tuned! +So we looked at the other options suggested by the error message and +`union_by_name` appeared promising. Enabling this option causes DuckDB +to combine all the JSON objects into a single fused schema. +Maybe this would work better? + +Sure enough, this works: +``` +duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', union_by_name=true)" +``` +We now have the `duckdb` database file for out GitHub Archive data called `gha.db` +containing a single table called `gha` embedded in that database. +What about the super-structured +format for the `super` command? There is no need to futz with sample sizes, +schema inference, or union by name, just run this to create a Super Binary file: +``` +super gharchive_gz/*.json.gz > gha.bsup +``` + +## Appendix 2: Running the Tests + +This appendix provides the raw tests and output that we run on a MacBook Pro to generate +the table of results above. 
+ +### Search Test + +``` +; time super -c " + SELECT count() + FROM 'gha.bsup' + WHERE grep('in case you have any feedback 😊', payload.pull_request.body) +" +{count:2(uint64)} +super -c 12.70s user 0.69s system 415% cpu 3.223 total + +time duckdb gha.db -c " + SELECT count() + FROM gha + WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' +" +┌──────────────┐ +│ count_star() │ +│ int64 │ +├──────────────┤ +│ 2 │ +└──────────────┘ +duckdb gha.db -c 26.66s user 6.90s system 406% cpu 8.266 total + +; time duckdb -c " + SELECT count() + FROM gha.parquet + WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' +" +┌──────────────┐ +│ count_star() │ +│ int64 │ +├──────────────┤ +│ 2 │ +└──────────────┘ +duckdb -c 42.71s user 6.06s system 582% cpu 8.380 total + +; time datafusion-cli -c " + SELECT count() + FROM 'gha.parquet' + WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' +" +DataFusion CLI v43.0.0 ++---------+ +| count() | ++---------+ +| 2 | ++---------+ +1 row(s) fetched. +Elapsed 8.819 seconds. 
+ +datafusion-cli -c 40.75s user 6.72s system 521% cpu 9.106 total + +; time clickhouse -q " + SELECT count() + FROM 'gha.parquet' + WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' +" +2 +clickhouse -q 50.81s user 1.83s system 94% cpu 55.994 total +``` + +### Search+ Test + +``` +; time super -c " + SELECT count() + FROM 'gha.bsup' + WHERE grep('in case you have any feedback 😊') +" +{count:3(uint64)} +super -c 43.80s user 0.71s system 669% cpu 6.653 total + +; time duckdb gha.db < search.sql +┌──────────────┐ +│ count_star() │ +│ int64 │ +├──────────────┤ +│ 3 │ +└──────────────┘ +duckdb gha.db < search.sql 73.60s user 33.29s system 435% cpu 24.563 total + +; time duckdb < search-parquet.sql +┌──────────────┐ +│ count_star() │ +│ int64 │ +├──────────────┤ +│ 3 │ +└──────────────┘ +duckdb < search-parquet.sql 89.57s user 29.21s system 513% cpu 23.113 total + +; time datafusion-cli -f search-parquet.sql +DataFusion CLI v43.0.0 ++---------+ +| count() | ++---------+ +| 3 | ++---------+ +1 row(s) fetched. +Elapsed 18.184 seconds. 
+datafusion-cli -f search-parquet.sql 83.84s user 11.13s system 513% cpu 18.494 total + +; time clickhouse --queries-file search-parquet.sql +3 +clickhouse --queries-file search-parquet.sql 515.68s user 5.50s system 112% cpu 7:43.37 total +``` +### Count Test + +``` +; time super -c " + SELECT count() + FROM 'gha.bsup' + WHERE actor.login='johnbieren' +" +{count:879(uint64)} +super -c 13.81s user 0.71s system 449% cpu 3.233 total + +; time SUPER_VAM=1 super -c " + SELECT count() + FROM 'gha.parquet' + WHERE actor.login='johnbieren' +" +{count:879(uint64)} +SUPER_VAM=1 super -c 0.43s user 0.08s system 277% cpu 0.182 total + +; time duckdb gha.db -c " + SELECT count() + FROM gha + WHERE actor.login='johnbieren' +" +┌──────────────┐ +│ count_star() │ +│ int64 │ +├──────────────┤ +│ 879 │ +└──────────────┘ +duckdb gha.db -c 0.64s user 0.06s system 517% cpu 0.134 total + +; time duckdb -c " + SELECT count() + FROM 'gha.parquet' + WHERE actor.login='johnbieren' +" +┌──────────────┐ +│ count_star() │ +│ int64 │ +├──────────────┤ +│ 879 │ +└──────────────┘ +duckdb gha.db -c 1.14s user 0.14s system 490% cpu 0.261 total + +DataFusion CLI v43.0.0 ++---------+ +| count() | ++---------+ +| 879 | ++---------+ +1 row(s) fetched. +Elapsed 0.203 seconds. 
+ +datafusion-cli -c 0.93s user 0.15s system 453% cpu 0.238 total + +; time clickhouse -q " + SELECT count() + FROM 'gha.parquet' + WHERE actor.login='johnbieren' +" +879 +clickhouse -q 0.86s user 0.07s system 93% cpu 1.001 total +``` + +### Agg Test + +``` +; time super -c " + SELECT count(),type + FROM 'gha.bsup' + WHERE repo.name='duckdb/duckdb' + GROUP BY type +" +{type:"PullRequestReviewEvent",count:14(uint64)} +{type:"IssueCommentEvent",count:30(uint64)} +{type:"WatchEvent",count:29(uint64)} +{type:"PullRequestEvent",count:35(uint64)} +{type:"PushEvent",count:15(uint64)} +{type:"IssuesEvent",count:9(uint64)} +{type:"ForkEvent",count:3(uint64)} +{type:"PullRequestReviewCommentEvent",count:7(uint64)} +super -c 12.24s user 0.68s system 413% cpu 3.129 total + +; time SUPER_VAM=1 super -c " + SELECT count(),type + FROM 'gha.parquet' + WHERE repo.name='duckdb/duckdb' + GROUP BY type +" +{type:"IssueCommentEvent",count:30(uint64)} +{type:"PullRequestEvent",count:35(uint64)} +{type:"PushEvent",count:15(uint64)} +{type:"WatchEvent",count:29(uint64)} +{type:"PullRequestReviewEvent",count:14(uint64)} +{type:"ForkEvent",count:3(uint64)} +{type:"PullRequestReviewCommentEvent",count:7(uint64)} +{type:"IssuesEvent",count:9(uint64)} +SUPER_VAM=1 super -c 1.01s user 0.13s system 421% cpu 0.271 total + +; time duckdb gha.db -c " + SELECT count(),type + FROM gha + WHERE repo.name='duckdb/duckdb' + GROUP BY type +" +┌──────────────┬───────────────────────────────┐ +│ count_star() │ type │ +│ int64 │ varchar │ +├──────────────┼───────────────────────────────┤ +│ 3 │ ForkEvent │ +│ 35 │ PullRequestEvent │ +│ 29 │ WatchEvent │ +│ 7 │ PullRequestReviewCommentEvent │ +│ 15 │ PushEvent │ +│ 9 │ IssuesEvent │ +│ 14 │ PullRequestReviewEvent │ +│ 30 │ IssueCommentEvent │ +└──────────────┴───────────────────────────────┘ +duckdb gha.db -c 0.49s user 0.06s system 466% cpu 0.119 total + +; time duckdb -c " + SELECT count(),type + FROM 'gha.parquet' + WHERE repo.name='duckdb/duckdb' + GROUP 
BY type +" +┌──────────────┬───────────────────────────────┐ +│ count_star() │ type │ +│ int64 │ varchar │ +├──────────────┼───────────────────────────────┤ +│ 9 │ IssuesEvent │ +│ 7 │ PullRequestReviewCommentEvent │ +│ 15 │ PushEvent │ +│ 14 │ PullRequestReviewEvent │ +│ 3 │ ForkEvent │ +│ 29 │ WatchEvent │ +│ 35 │ PullRequestEvent │ +│ 30 │ IssueCommentEvent │ +└──────────────┴───────────────────────────────┘ +duckdb -c 0.73s user 0.14s system 413% cpu 0.211 total + +; time datafusion-cli -c " + SELECT count(),type + FROM 'gha.parquet' + WHERE repo.name='duckdb/duckdb' + GROUP BY type +" +DataFusion CLI v43.0.0 ++---------+-------------------------------+ +| count() | type | ++---------+-------------------------------+ +| 15 | PushEvent | +| 35 | PullRequestEvent | +| 7 | PullRequestReviewCommentEvent | +| 14 | PullRequestReviewEvent | +| 30 | IssueCommentEvent | +| 9 | IssuesEvent | +| 29 | WatchEvent | +| 3 | ForkEvent | ++---------+-------------------------------+ +8 row(s) fetched. +Elapsed 0.200 seconds. 
+ +datafusion-cli -c 0.80s user 0.15s system 398% cpu 0.238 total + +; time clickhouse -q " + SELECT count(),type + FROM 'gha.parquet' + WHERE repo.name='duckdb/duckdb' + GROUP BY type +" +30 IssueCommentEvent +14 PullRequestReviewEvent +15 PushEvent +29 WatchEvent +9 IssuesEvent +7 PullRequestReviewCommentEvent +3 ForkEvent +35 PullRequestEvent +clickhouse -q 0.77s user 0.11s system 97% cpu 0.908 total +``` + +### Union Test + +``` +time super -c " + FROM 'gha.bsup' + | SELECT VALUE payload.pull_request + | WHERE this IS NOT NULL + | UNNEST [...assignees, assignee] + | WHERE this IS NOT NULL + | AGGREGATE count() BY assignee:=login + | ORDER BY count DESC + | LIMIT 5 +" +{assignee:"poad",count:1966(uint64)} +{assignee:"vinayakkulkarni",count:508(uint64)} +{assignee:"tmtmtmtm",count:356(uint64)} +{assignee:"AMatutat",count:260(uint64)} +{assignee:"danwinship",count:208(uint64)} +super -c 12.39s user 0.95s system 351% cpu 3.797 total + +; time duckdb gha.db -c " + WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM gha + UNION ALL + SELECT unnest(payload.pull_request.assignees).login assignee + FROM gha + ) + SELECT assignee, count(*) count + FROM assignees + WHERE assignee NOT NULL + GROUP BY assignee + ORDER BY count DESC + LIMIT 5 +" +┌─────────────────┬───────┐ +│ assignee │ count │ +│ varchar │ int64 │ +├─────────────────┼───────┤ +│ poad │ 1966 │ +│ vinayakkulkarni │ 508 │ +│ tmtmtmtm │ 356 │ +│ AMatutat │ 260 │ +│ danwinship │ 208 │ +└─────────────────┴───────┘ +duckdb gha.db -c 3119.93s user 90.86s system 719% cpu 7:26.22 total + +time duckdb -c " + WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM 'gha.parquet' + UNION ALL + SELECT unnest(payload.pull_request.assignees).login assignee + FROM 'gha.parquet' + ) + SELECT assignee, count(*) count + FROM assignees + WHERE assignee NOT NULL + GROUP BY assignee + ORDER BY count DESC + LIMIT 5 +" +┌─────────────────┬───────┐ +│ assignee │ count │ +│ 
varchar │ int64 │ +├─────────────────┼───────┤ +│ poad │ 1966 │ +│ vinayakkulkarni │ 508 │ +│ tmtmtmtm │ 356 │ +│ AMatutat │ 260 │ +│ danwinship │ 208 │ +└─────────────────┴───────┘ +duckdb -c 2914.72s user 107.15s system 721% cpu 6:58.68 total + +time datafusion-cli -c " + WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM 'gha.parquet' + UNION ALL + SELECT object.login as assignee FROM ( + SELECT unnest(payload.pull_request.assignees) object + FROM 'gha.parquet' + ) + ) + SELECT assignee, count() count + FROM assignees + WHERE assignee IS NOT NULL + GROUP BY assignee + ORDER BY count DESC + LIMIT 5 +" +DataFusion CLI v43.0.0 ++-----------------+-------+ +| assignee | count | ++-----------------+-------+ +| poad | 1966 | +| vinayakkulkarni | 508 | +| tmtmtmtm | 356 | +| AMatutat | 260 | +| danwinship | 208 | ++-----------------+-------+ +5 row(s) fetched. +Elapsed 39.012 seconds. + +datafusion-cli -c 116.97s user 44.50s system 408% cpu 39.533 total + +; time clickhouse -q " + WITH assignees AS ( + SELECT payload.pull_request.assignee.login assignee + FROM 'gha.parquet' + UNION ALL + SELECT arrayJoin(payload.pull_request.assignees).login assignee + FROM 'gha.parquet' + ) + SELECT assignee, count(*) count + FROM assignees + WHERE assignee IS NOT NULL + GROUP BY assignee + ORDER BY count DESC + LIMIT 5 +" +poad 1966 +vinayakkulkarni 508 +tmtmtmtm 356 +AMatutat 260 +danwinship 208 +clickhouse -q 105.49s user 6.54s system 169% cpu 1:06.27 total +``` From 29b5e20612946571d739d7c81482160cc11393d3 Mon Sep 17 00:00:00 2001 From: Steven McCanne Date: Thu, 14 Nov 2024 15:44:54 -0800 Subject: [PATCH 2/4] fix trailing whitespace --- docs/commands/super.md | 52 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/commands/super.md b/docs/commands/super.md index 7329797807..389ccae586 100644 --- a/docs/commands/super.md +++ b/docs/commands/super.md @@ -655,7 +655,7 @@ produces You might think 
that the overhead involved in managing super-structured types and the generality of heterogeneous data would confound the performance of -the `super` command, but it turns out that `super` can hold its own when +the `super` command, but it turns out that `super` can hold its own when compared to other analytics systems. To illustrate comparative performance, we'll present some informal performance @@ -667,7 +667,7 @@ measurements among `super`, We'll use the Parquet format to compare apples to apples and also report results for the custom columnar database format of DuckDB and the Super Binary format used by `super`. -We tried loading our test data into a ClickHouse table using its +We tried loading our test data into a ClickHouse table using its [new experimental JSON type](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse) but those attempts failed with "too many open files". @@ -690,7 +690,7 @@ wget https://data.gharchive.org/2023-02-08-1.json.gz wget https://data.gharchive.org/2023-02-08-23.json.gz ``` We downloadied these files into a directory called `gharchive_gz` -and created a duckdb database file called `gha.db` and a table called `gha` +and created a duckdb database file called `gha.db` and a table called `gha` using this command: ``` duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', union_by_name=true)" @@ -703,7 +703,7 @@ We then created a Parquet file called `gha.parquet` with this command: ``` duckdb gha.db -c "COPY (from gha) TO 'gha.parquet'" ``` -To create a super-structed file for the `super` command, there is no need to +To create a super-structed file for the `super` command, there is no need to fuse the data into a single schema (though `super` can still work with the fused schema in the Parquet file), and we simply ran this command to create a Super Binary file: @@ -744,7 +744,7 @@ and we'll just count the number of matches found. 
The number of matches is small (3) so the query performance is dominated by the search. -The SQL for this query is +The SQL for this query is ```sql SELECT count() FROM 'gha.parquet' -- or gha @@ -761,8 +761,8 @@ WHERE grep('in case you have any feedback 😊', payload.pull_request.body) #### Search+ -For search across multiple columns, SQL doesn't have a `grep` function so -we must enumerate all the fields of such a query. The SQL for a string search +For search across multiple columns, SQL doesn't have a `grep` function so +we must enumerate all the fields of such a query. The SQL for a string search over our GitHub Archive dataset involves the following fields: ```sql SELECT count() FROM gha @@ -795,7 +795,7 @@ This query has the form: ``` SELECT count(),type FROM 'gha.parquet' -- or 'gha' or 'gha.bsup' -WHERE repo.name='duckdb/duckdb' +WHERE repo.name='duckdb/duckdb' GROUP BY type ``` @@ -806,7 +806,7 @@ The `union` test is straight out of the DuckDB blog at the end of This query computes the GitHub users that were assigned as a PR reviewer the most often and returns the top 5 such users. Because the assignees can appear in either a list of strings -or within a single string field, the relational model requires that two different +or within a single string field, the relational model requires that two different subqueries run for the two cases and the result unioned together; then, this intermediary table can be counted using the unnested assignee as the group-by key. @@ -831,7 +831,7 @@ For DataFusion, we needed to rewrite this SELECT SELECT unnest(payload.pull_request.assignees).login FROM 'gha.parquet' ``` -as +as ```sql SELECT rec.login as assignee FROM ( SELECT unnest(payload.pull_request.assignees) rec @@ -840,8 +840,8 @@ SELECT rec.login as assignee FROM ( ``` and for ClickHouse, we had to use `arrayJoin` instead of `unnest`. -SuperSQL's data model does not require these sorts of gymnastics as -everything does not have to be jammed into a table. 
Instead, we can use the +SuperSQL's data model does not require these sorts of gymnastics as +everything does not have to be jammed into a table. Instead, we can use the `UNNEST` pipe operator combined with the spread operator applied to the array of string fields to easily produce a stream of string values representing the assignees. Then we simply aggregate the assignee stream: @@ -876,7 +876,7 @@ Since DuckDB with its native format is overall the best performing, we used it as the baseline for all of the speedup factors. To summarize, -`super` with Super Binary is substantially faster than the relational systems for +`super` with Super Binary is substantially faster than the relational systems for the search use cases and performs on par with the others for traditional OLAP queries, except for the union query, where the super-structured data model trounces the relational model (by over 100X!) for stiching together disparate data types for analysis in an aggregation. @@ -884,7 +884,7 @@ model (by over 100X!) for stiching together disparate data types for analysis in ## Appendix 1: Preparing the Test Data For our tests, We diverged a bit from the methodology in the DuckDB blog and wanted -to put all the JSON data in a single table. It wasn't obvious how to go about this +to put all the JSON data in a single table. It wasn't obvious how to go about this and this section documents the difficulties we encountered trying to do so. First, we simply tried this: @@ -939,8 +939,8 @@ duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', s and again `duckdb` runs out of memory. So we looked at the other options suggested by the error message and -`union_by_name` appeared promising. Enabling this option causes DuckDB -to combine all the JSON objects into a single fused schema. +`union_by_name` appeared promising. Enabling this option causes DuckDB +to combine all the JSON objects into a single fused schema. Maybe this would work better? 
Sure enough, this works: @@ -976,7 +976,7 @@ time duckdb gha.db -c " SELECT count() FROM gha WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -" +" ┌──────────────┐ │ count_star() │ │ int64 │ @@ -989,7 +989,7 @@ duckdb gha.db -c 26.66s user 6.90s system 406% cpu 8.266 total SELECT count() FROM gha.parquet WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' -" +" ┌──────────────┐ │ count_star() │ │ int64 │ @@ -998,7 +998,7 @@ duckdb gha.db -c 26.66s user 6.90s system 406% cpu 8.266 total └──────────────┘ duckdb -c 42.71s user 6.06s system 582% cpu 8.380 total -; time datafusion-cli -c " +; time datafusion-cli -c " SELECT count() FROM 'gha.parquet' WHERE payload.pull_request.body LIKE '%in case you have any feedback 😊%' @@ -1009,7 +1009,7 @@ DataFusion CLI v43.0.0 +---------+ | 2 | +---------+ -1 row(s) fetched. +1 row(s) fetched. Elapsed 8.819 seconds. datafusion-cli -c 40.75s user 6.72s system 521% cpu 9.106 total @@ -1030,7 +1030,7 @@ clickhouse -q 50.81s user 1.83s system 94% cpu 55.994 total SELECT count() FROM 'gha.bsup' WHERE grep('in case you have any feedback 😊') -" +" {count:3(uint64)} super -c 43.80s user 0.71s system 669% cpu 6.653 total @@ -1043,7 +1043,7 @@ super -c 43.80s user 0.71s system 669% cpu 6.653 total └──────────────┘ duckdb gha.db < search.sql 73.60s user 33.29s system 435% cpu 24.563 total -; time duckdb < search-parquet.sql +; time duckdb < search-parquet.sql ┌──────────────┐ │ count_star() │ │ int64 │ @@ -1059,7 +1059,7 @@ DataFusion CLI v43.0.0 +---------+ | 3 | +---------+ -1 row(s) fetched. +1 row(s) fetched. Elapsed 18.184 seconds. datafusion-cli -f search-parquet.sql 83.84s user 11.13s system 513% cpu 18.494 total @@ -1118,7 +1118,7 @@ DataFusion CLI v43.0.0 +---------+ | 879 | +---------+ -1 row(s) fetched. +1 row(s) fetched. Elapsed 0.203 seconds. 
datafusion-cli -c 0.93s user 0.15s system 453% cpu 0.238 total @@ -1228,7 +1228,7 @@ DataFusion CLI v43.0.0 | 29 | WatchEvent | | 3 | ForkEvent | +---------+-------------------------------+ -8 row(s) fetched. +8 row(s) fetched. Elapsed 0.200 seconds. datafusion-cli -c 0.80s user 0.15s system 398% cpu 0.238 total @@ -1351,7 +1351,7 @@ DataFusion CLI v43.0.0 | AMatutat | 260 | | danwinship | 208 | +-----------------+-------+ -5 row(s) fetched. +5 row(s) fetched. Elapsed 39.012 seconds. datafusion-cli -c 116.97s user 44.50s system 408% cpu 39.533 total From 4c76764dfbcf97fed7bc0c7533db779888d30467 Mon Sep 17 00:00:00 2001 From: Steven McCanne Date: Thu, 14 Nov 2024 15:49:42 -0800 Subject: [PATCH 3/4] fix markdown link --- docs/commands/super.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/commands/super.md b/docs/commands/super.md index 389ccae586..e3a4e0afc4 100644 --- a/docs/commands/super.md +++ b/docs/commands/super.md @@ -28,7 +28,7 @@ a log-search experience despite its SQL foundation. The `super` command works with data from ephemeral sources like files and URLs. If you want to persist your data into a data lake for persistent storage, -check out the [`super db`](zed.md) set of commands. +check out the [`super db`](super-db.md) set of commands. By invoking the `-c` option, a query expressed in the [SuperSQL language](../language/README.md) may be specified and applied to the input stream. 
From 0321a1cd33b02a8d0b65237af1c4dc3bf36ab799 Mon Sep 17 00:00:00 2001 From: Steven McCanne Date: Fri, 15 Nov 2024 11:38:36 -0800 Subject: [PATCH 4/4] address PR feedback --- docs/commands/super.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/commands/super.md b/docs/commands/super.md index e3a4e0afc4..41eaa2d839 100644 --- a/docs/commands/super.md +++ b/docs/commands/super.md @@ -73,7 +73,7 @@ providing a convenient means to convert files from one format to another, e.g., super -f arrows file1.json file2.parquet file3.csv > file-combined.arrows ``` When `super` is run with a query that has no "from" operator and no input arguments, -the SuperSQL query is fed a single `null` value analagous to SQL's default +the SuperSQL query is fed a single `null` value analogous to SQL's default input of a single empty row of an unnamed table. This provides a convenient means to explore examples or run in a "calculator mode", e.g., @@ -94,7 +94,7 @@ is equivalent to select value 1+1 ``` To learn more about shortcuts, refer to the SuperSQL -[documenation on shortcuts](../language/pipeline-model.md#implied-operators). +[documentation on shortcuts](../language/pipeline-model.md#implied-operators). For built-in command help and a listing of all available options, simply run `super` with no arguments. @@ -104,7 +104,7 @@ simply run `super` with no arguments. `super` supports a number of [input](#input-formats) and [output](#output-formats) formats, but the super formats ([Super Binary](../formats/bsup.md), [Super Columnar](../formats/csup.md), -and [Super JSON](../formats/jsup.md)) tend to the most versatile and +and [Super JSON](../formats/jsup.md)) tend to be the most versatile and easy to work with. `super` typically operates on binary-encoded data and when you want to inspect @@ -689,7 +689,7 @@ wget https://data.gharchive.org/2023-02-08-1.json.gz ... 
wget https://data.gharchive.org/2023-02-08-23.json.gz ``` -We downloadied these files into a directory called `gharchive_gz` +We downloaded these files into a directory called `gharchive_gz` and created a duckdb database file called `gha.db` and a table called `gha` using this command: ``` @@ -879,11 +879,11 @@ To summarize, `super` with Super Binary is substantially faster than the relational systems for the search use cases and performs on par with the others for traditional OLAP queries, except for the union query, where the super-structured data model trounces the relational -model (by over 100X!) for stiching together disparate data types for analysis in an aggregation. +model (by over 100X!) for stitching together disparate data types for analysis in an aggregation. ## Appendix 1: Preparing the Test Data -For our tests, We diverged a bit from the methodology in the DuckDB blog and wanted +For our tests, we diverged a bit from the methodology in the DuckDB blog and wanted to put all the JSON data in a single table. It wasn't obvious how to go about this and this section documents the difficulties we encountered trying to do so. @@ -947,7 +947,7 @@ Sure enough, this works: ``` duckdb gha.db -c "CREATE TABLE gha AS FROM read_json('gharchive_gz/*.json.gz', union_by_name=true)" ``` -We now have the `duckdb` database file for out GitHub Archive data called `gha.db` +We now have the `duckdb` database file for our GitHub Archive data called `gha.db` containing a single table called `gha` embedded in that database. What about the super-structured format for the `super` command? There is no need to futz with sample sizes,