From b9e3e46967f57cffaedc5e89f18a763fabfbf327 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Mon, 19 Feb 2024 10:40:44 -0500 Subject: [PATCH 1/3] Add test script to generate people --- .github/actions/people/Dockerfile | 7 + .github/actions/people/action.yml | 13 + .github/actions/people/app/main.py | 621 +++++++++++++++++++++++++++++ 3 files changed, 641 insertions(+) create mode 100644 .github/actions/people/Dockerfile create mode 100644 .github/actions/people/action.yml create mode 100644 .github/actions/people/app/main.py diff --git a/.github/actions/people/Dockerfile b/.github/actions/people/Dockerfile new file mode 100644 index 0000000000000..1455106bde3c8 --- /dev/null +++ b/.github/actions/people/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.9 + +RUN pip install httpx PyGithub "pydantic==2.0.2" pydantic-settings "pyyaml>=5.3.1,<6.0.0" + +COPY ./app /app + +CMD ["python", "/app/main.py"] diff --git a/.github/actions/people/action.yml b/.github/actions/people/action.yml new file mode 100644 index 0000000000000..e7f135c398a9b --- /dev/null +++ b/.github/actions/people/action.yml @@ -0,0 +1,13 @@ +# This action was adapted from "Sebastián Ramírez " +# From the FastAPI project. +# See for more details: +# https://github.com/tiangolo/fastapi/tree/master/github/actions/people +name: "Generate LangChain People" +description: "Generate the data for the LangChain People pages" +inputs: + token: + description: 'User token, to read the GitHub API. Can be passed in using {{ secrets.LANGCHAIN_PEOPLE }}' + required: true +runs: + using: 'docker' + image: 'Dockerfile' diff --git a/.github/actions/people/app/main.py b/.github/actions/people/app/main.py new file mode 100644 index 0000000000000..a9d35faeccac1 --- /dev/null +++ b/.github/actions/people/app/main.py @@ -0,0 +1,621 @@ +# This action was adapted from "Sebastián Ramírez " +# From the FastAPI project. +# See for more details: +# https://github.com/tiangolo/fastapi/tree/master/github/actions/people +import logging +import subprocess +import sys +from collections import Counter +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Container, Dict, List, Set, Union + +import httpx +import yaml +from github import Github +from pydantic import BaseModel, SecretStr +from pydantic_settings import BaseSettings + +GITHUB_GRAPHQL_URL = "https://api.github.com/graphql" +# Need to look up the questions category ID. +# For now this is just a placeholder. 
+QUESTIONS_CATEGORY_ID = "[placeholder]" + +DISCUSSIONS_QUERY = """ +query Q($after: String, $category_id: ID) { + repository(name: "langchain", owner: "langchain-ai") { + discussions(first: 100, after: $after, categoryId: $category_id) { + edges { + cursor + node { + number + author { + login + avatarUrl + url + } + title + createdAt + comments(first: 100) { + nodes { + createdAt + author { + login + avatarUrl + url + } + isAnswer + replies(first: 10) { + nodes { + createdAt + author { + login + avatarUrl + url + } + } + } + } + } + } + } + } + } +} +""" + + +ISSUES_QUERY = """ +query Q($after: String) { + repository(name: "langchain", owner: "langchain-ai") { + issues(first: 100, after: $after) { + edges { + cursor + node { + number + author { + login + avatarUrl + url + } + title + createdAt + state + comments(first: 100) { + nodes { + createdAt + author { + login + avatarUrl + url + } + } + } + } + } + } + } +} +""" + +PRS_QUERY = """ +query Q($after: String) { + repository(name: "langchain", owner: "langchain-ai") { + pullRequests(first: 100, after: $after) { + edges { + cursor + node { + number + labels(first: 100) { + nodes { + name + } + } + author { + login + avatarUrl + url + } + title + createdAt + state + comments(first: 100) { + nodes { + createdAt + author { + login + avatarUrl + url + } + } + } + reviews(first:100) { + nodes { + author { + login + avatarUrl + url + } + state + } + } + } + } + } + } +} +""" + + + +class Author(BaseModel): + login: str + avatarUrl: str + url: str + + +# Issues and Discussions + + +class CommentsNode(BaseModel): + createdAt: datetime + author: Union[Author, None] = None + + +class Replies(BaseModel): + nodes: List[CommentsNode] + + +class DiscussionsCommentsNode(CommentsNode): + replies: Replies + + +class Comments(BaseModel): + nodes: List[CommentsNode] + + +class DiscussionsComments(BaseModel): + nodes: List[DiscussionsCommentsNode] + + +class IssuesNode(BaseModel): + number: int + author: Union[Author, None] = None + title: str + createdAt: datetime + state: str + comments: Comments + + +class DiscussionsNode(BaseModel): + number: int + author: Union[Author, None] = None + title: str + createdAt: datetime + comments: DiscussionsComments + + +class IssuesEdge(BaseModel): + cursor: str + node: IssuesNode + + +class DiscussionsEdge(BaseModel): + cursor: str + node: DiscussionsNode + + +class Issues(BaseModel): + edges: List[IssuesEdge] + + +class Discussions(BaseModel): + edges: List[DiscussionsEdge] + + +class IssuesRepository(BaseModel): + issues: Issues + + +class DiscussionsRepository(BaseModel): + discussions: Discussions + + +class IssuesResponseData(BaseModel): + repository: IssuesRepository + + +class DiscussionsResponseData(BaseModel): + repository: DiscussionsRepository + + +class IssuesResponse(BaseModel): + data: IssuesResponseData + + +class DiscussionsResponse(BaseModel): + data: DiscussionsResponseData + + +# PRs + + +class LabelNode(BaseModel): + name: str + + +class Labels(BaseModel): + nodes: List[LabelNode] + + +class ReviewNode(BaseModel): + author: Union[Author, None] = None + state: str + + +class Reviews(BaseModel): + nodes: List[ReviewNode] + + +class PullRequestNode(BaseModel): + number: int + labels: Labels + author: Union[Author, None] = None + title: str + createdAt: datetime + state: str + comments: Comments + reviews: Reviews + + +class PullRequestEdge(BaseModel): + cursor: str + node: PullRequestNode + + +class PullRequests(BaseModel): + edges: List[PullRequestEdge] + + +class PRsRepository(BaseModel): + 
pullRequests: PullRequests + + +class PRsResponseData(BaseModel): + repository: PRsRepository + + +class PRsResponse(BaseModel): + data: PRsResponseData + + +class Settings(BaseSettings): + input_token: SecretStr + github_repository: str + httpx_timeout: int = 30 + + +def get_graphql_response( + *, + settings: Settings, + query: str, + after: Union[str, None] = None, + category_id: Union[str, None] = None, +) -> Dict[str, Any]: + headers = {"Authorization": f"token {settings.input_token.get_secret_value()}"} + # category_id is only used by one query, but GraphQL allows unused variables, so + # keep it here for simplicity + variables = {"after": after, "category_id": category_id} + response = httpx.post( + GITHUB_GRAPHQL_URL, + headers=headers, + timeout=settings.httpx_timeout, + json={"query": query, "variables": variables, "operationName": "Q"}, + ) + if response.status_code != 200: + logging.error( + f"Response was not 200, after: {after}, category_id: {category_id}" + ) + logging.error(response.text) + raise RuntimeError(response.text) + data = response.json() + if "errors" in data: + logging.error(f"Errors in response, after: {after}, category_id: {category_id}") + logging.error(data["errors"]) + logging.error(response.text) + raise RuntimeError(response.text) + return data + + +def get_graphql_issue_edges(*, settings: Settings, after: Union[str, None] = None): + data = get_graphql_response(settings=settings, query=ISSUES_QUERY, after=after) + graphql_response = IssuesResponse.model_validate(data) + return graphql_response.data.repository.issues.edges + + +def get_graphql_question_discussion_edges( + *, + settings: Settings, + after: Union[str, None] = None, +): + data = get_graphql_response( + settings=settings, + query=DISCUSSIONS_QUERY, + after=after, + category_id=QUESTIONS_CATEGORY_ID, + ) + graphql_response = DiscussionsResponse.model_validate(data) + return graphql_response.data.repository.discussions.edges + + +def get_graphql_pr_edges(*, settings: Settings, after: Union[str, None] = None): + data = get_graphql_response(settings=settings, query=PRS_QUERY, after=after) + graphql_response = PRsResponse.model_validate(data) + return graphql_response.data.repository.pullRequests.edges + + + +def get_issues_experts(settings: Settings): + issue_nodes: List[IssuesNode] = [] + issue_edges = get_graphql_issue_edges(settings=settings) + + while issue_edges: + for edge in issue_edges: + issue_nodes.append(edge.node) + last_edge = issue_edges[-1] + issue_edges = get_graphql_issue_edges(settings=settings, after=last_edge.cursor) + + commentors = Counter() + last_month_commentors = Counter() + authors: Dict[str, Author] = {} + + now = datetime.now(tz=timezone.utc) + one_month_ago = now - timedelta(days=30) + + for issue in issue_nodes: + issue_author_name = None + if issue.author: + authors[issue.author.login] = issue.author + issue_author_name = issue.author.login + issue_commentors = set() + for comment in issue.comments.nodes: + if comment.author: + authors[comment.author.login] = comment.author + if comment.author.login != issue_author_name: + issue_commentors.add(comment.author.login) + for author_name in issue_commentors: + commentors[author_name] += 1 + if issue.createdAt > one_month_ago: + last_month_commentors[author_name] += 1 + + return commentors, last_month_commentors, authors + + +def get_discussions_experts(settings: Settings): + discussion_nodes: List[DiscussionsNode] = [] + discussion_edges = get_graphql_question_discussion_edges(settings=settings) + + while 
discussion_edges: + for discussion_edge in discussion_edges: + discussion_nodes.append(discussion_edge.node) + last_edge = discussion_edges[-1] + discussion_edges = get_graphql_question_discussion_edges( + settings=settings, after=last_edge.cursor + ) + + commentors = Counter() + last_month_commentors = Counter() + authors: Dict[str, Author] = {} + + now = datetime.now(tz=timezone.utc) + one_month_ago = now - timedelta(days=30) + + for discussion in discussion_nodes: + discussion_author_name = None + if discussion.author: + authors[discussion.author.login] = discussion.author + discussion_author_name = discussion.author.login + discussion_commentors = set() + for comment in discussion.comments.nodes: + if comment.author: + authors[comment.author.login] = comment.author + if comment.author.login != discussion_author_name: + discussion_commentors.add(comment.author.login) + for reply in comment.replies.nodes: + if reply.author: + authors[reply.author.login] = reply.author + if reply.author.login != discussion_author_name: + discussion_commentors.add(reply.author.login) + for author_name in discussion_commentors: + commentors[author_name] += 1 + if discussion.createdAt > one_month_ago: + last_month_commentors[author_name] += 1 + return commentors, last_month_commentors, authors + + +def get_experts(settings: Settings): + # Migrated to only use GitHub Discussions + # ( + # issues_commentors, + # issues_last_month_commentors, + # issues_authors, + # ) = get_issues_experts(settings=settings) + ( + discussions_commentors, + discussions_last_month_commentors, + discussions_authors, + ) = get_discussions_experts(settings=settings) + # commentors = issues_commentors + discussions_commentors + commentors = discussions_commentors + # last_month_commentors = ( + # issues_last_month_commentors + discussions_last_month_commentors + # ) + last_month_commentors = discussions_last_month_commentors + # authors = {**issues_authors, **discussions_authors} + authors = {**discussions_authors} + return commentors, last_month_commentors, authors + + +def get_contributors(settings: Settings): + pr_nodes: List[PullRequestNode] = [] + pr_edges = get_graphql_pr_edges(settings=settings) + + while pr_edges: + for edge in pr_edges: + pr_nodes.append(edge.node) + last_edge = pr_edges[-1] + pr_edges = get_graphql_pr_edges(settings=settings, after=last_edge.cursor) + + contributors = Counter() + commentors = Counter() + reviewers = Counter() + authors: Dict[str, Author] = {} + + for pr in pr_nodes: + author_name = None + if pr.author: + authors[pr.author.login] = pr.author + author_name = pr.author.login + pr_commentors: Set[str] = set() + pr_reviewers: Set[str] = set() + for comment in pr.comments.nodes: + if comment.author: + authors[comment.author.login] = comment.author + if comment.author.login == author_name: + continue + pr_commentors.add(comment.author.login) + for author_name in pr_commentors: + commentors[author_name] += 1 + for review in pr.reviews.nodes: + if review.author: + authors[review.author.login] = review.author + pr_reviewers.add(review.author.login) + for reviewer in pr_reviewers: + reviewers[reviewer] += 1 + if pr.state == "MERGED" and pr.author: + contributors[pr.author.login] += 1 + return contributors, commentors, reviewers, authors + + + +def get_top_users( + *, + counter: Counter, + min_count: int, + authors: Dict[str, Author], + skip_users: Container[str], +): + users = [] + for commentor, count in counter.most_common(50): + if commentor in skip_users: + continue + if count >= min_count: + 
author = authors[commentor] + users.append( + { + "login": commentor, + "count": count, + "avatarUrl": author.avatarUrl, + "url": author.url, + } + ) + return users + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + settings = Settings() + logging.info(f"Using config: {settings.model_dump_json()}") + g = Github(settings.input_token.get_secret_value()) + repo = g.get_repo(settings.github_repository) + question_commentors, question_last_month_commentors, question_authors = get_experts( + settings=settings + ) + contributors, pr_commentors, reviewers, pr_authors = get_contributors( + settings=settings + ) + authors = {**question_authors, **pr_authors} + maintainers_logins = {} + bot_names = {"codecov", "github-actions", "pre-commit-ci", "dependabot", "dosu"} + maintainers = [] + for login in maintainers_logins: + user = authors[login] + maintainers.append( + { + "login": login, + "answers": question_commentors[login], + "prs": contributors[login], + "avatarUrl": user.avatarUrl, + "url": user.url, + } + ) + + min_count_expert = 10 + min_count_last_month = 3 + min_count_contributor = 4 + min_count_reviewer = 4 + skip_users = maintainers_logins | bot_names + experts = get_top_users( + counter=question_commentors, + min_count=min_count_expert, + authors=authors, + skip_users=skip_users, + ) + last_month_active = get_top_users( + counter=question_last_month_commentors, + min_count=min_count_last_month, + authors=authors, + skip_users=skip_users, + ) + top_contributors = get_top_users( + counter=contributors, + min_count=min_count_contributor, + authors=authors, + skip_users=skip_users, + ) + top_reviewers = get_top_users( + counter=reviewers, + min_count=min_count_reviewer, + authors=authors, + skip_users=skip_users, + ) + + people = { + "maintainers": maintainers, + "experts": experts, + "last_month_active": last_month_active, + "top_contributors": top_contributors, + "top_reviewers": top_reviewers, + } + people_path = Path("./docs/en/data/people.yml") + people_old_content = people_path.read_text(encoding="utf-8") + new_people_content = yaml.dump( + people, sort_keys=False, width=200, allow_unicode=True + ) + if ( + people_old_content == new_people_content + ): + logging.info("The LangChain People data hasn't changed, finishing.") + sys.exit(0) + people_path.write_text(new_people_content, encoding="utf-8") + logging.info("Setting up GitHub Actions git user") + subprocess.run(["git", "config", "user.name", "github-actions"], check=True) + subprocess.run( + ["git", "config", "user.email", "github-actions@github.com"], check=True + ) + branch_name = "fastapi-people" + logging.info(f"Creating a new branch {branch_name}") + subprocess.run(["git", "checkout", "-b", branch_name], check=True) + logging.info("Adding updated file") + subprocess.run( + ["git", "add", str(people_path)], check=True + ) + logging.info("Committing updated file") + message = "👥 Update LangChain People" + result = subprocess.run(["git", "commit", "-m", message], check=True) + logging.info("Pushing branch") + subprocess.run(["git", "push", "origin", branch_name], check=True) + logging.info("Creating PR") + pr = repo.create_pull(title=message, body=message, base="master", head=branch_name) + logging.info(f"Created PR: {pr.number}") + logging.info("Finished") From f4e0de634a0529a648eb0c80d0c5b6ad346885b0 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Mon, 19 Feb 2024 10:43:21 -0500 Subject: [PATCH 2/3] x --- .github/actions/people/app/main.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 
deletions(-) diff --git a/.github/actions/people/app/main.py b/.github/actions/people/app/main.py index a9d35faeccac1..dcd6cb76b84f1 100644 --- a/.github/actions/people/app/main.py +++ b/.github/actions/people/app/main.py @@ -8,7 +8,7 @@ from collections import Counter from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import Any, Container, Dict, List, Set, Union +from typing import Any, Container, Dict, List, Set, Union, Tuple import httpx import yaml @@ -391,7 +391,7 @@ def get_issues_experts(settings: Settings): return commentors, last_month_commentors, authors -def get_discussions_experts(settings: Settings): +def get_discussions_experts(settings: Settings) -> Tuple[Counter, Counter, Dict[str, Author]]: discussion_nodes: List[DiscussionsNode] = [] discussion_edges = get_graphql_question_discussion_edges(settings=settings) @@ -433,7 +433,7 @@ def get_discussions_experts(settings: Settings): return commentors, last_month_commentors, authors -def get_experts(settings: Settings): +def get_experts(settings: Settings) -> Tuple[Counter, Counter, Dict[str, Author]]: # Migrated to only use GitHub Discussions # ( # issues_commentors, @@ -528,9 +528,15 @@ def get_top_users( logging.info(f"Using config: {settings.model_dump_json()}") g = Github(settings.input_token.get_secret_value()) repo = g.get_repo(settings.github_repository) - question_commentors, question_last_month_commentors, question_authors = get_experts( - settings=settings - ) + # Need to look up discussion category ID, and then can uncomment + # question_commentors, question_last_month_commentors, question_authors = get_experts( + # settings=settings + # ) + # Placeholder experts for now + question_commentors = Counter() + question_last_month_commentors = Counter() + question_authors = {} + contributors, pr_commentors, reviewers, pr_authors = get_contributors( settings=settings ) From bce9db786224c4dc3c7f4ae08c28ea8aa26de9f8 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Mon, 19 Feb 2024 10:56:33 -0500 Subject: [PATCH 3/3] x --- .github/workflows/people.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/people.yml diff --git a/.github/workflows/people.yml b/.github/workflows/people.yml new file mode 100644 index 0000000000000..204236a2ba0f7 --- /dev/null +++ b/.github/workflows/people.yml @@ -0,0 +1,23 @@ +name: LangChain People + +on: +# schedule: +# - cron: "0 14 1 * *" + workflow_dispatch: + +jobs: + langchain-people: + if: github.repository_owner == 'langchain-ai' + runs-on: ubuntu-latest + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: echo "$GITHUB_CONTEXT" + - uses: actions/checkout@v4 + # Ref: https://github.com/actions/runner/issues/2033 + - name: Fix git safe.directory in container + run: mkdir -p /home/runner/work/_temp/_github_home && printf "[safe]\n\tdirectory = /github/workspace" > /home/runner/work/_temp/_github_home/.gitconfig + - uses: ./.github/actions/people + with: + token: ${{ secrets.LANGCHAIN_PEOPLE }} \ No newline at end of file
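
Note on the QUESTIONS_CATEGORY_ID placeholder: the discussion category ID that app/main.py needs can be fetched once with a small GraphQL query against the repository's discussionCategories connection. Below is a minimal one-off sketch, not part of the patches above; it assumes discussions are enabled on langchain-ai/langchain, that the target category is literally named "Q&A", and that a token is available in the GITHUB_TOKEN environment variable (all three are assumptions), and it reuses the httpx dependency the action already installs.

    # lookup_category.py - hypothetical one-off helper, not part of the action itself.
    import os

    import httpx

    GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"

    # Anonymous query listing the repository's discussion categories with their node IDs.
    CATEGORIES_QUERY = """
    query {
      repository(name: "langchain", owner: "langchain-ai") {
        discussionCategories(first: 25) {
          nodes {
            id
            name
          }
        }
      }
    }
    """


    def lookup_questions_category_id() -> str:
        """Return the node ID of the Q&A discussion category (the name is an assumption)."""
        headers = {"Authorization": f"token {os.environ['GITHUB_TOKEN']}"}
        response = httpx.post(
            GITHUB_GRAPHQL_URL,
            headers=headers,
            json={"query": CATEGORIES_QUERY},
            timeout=30,
        )
        response.raise_for_status()
        nodes = response.json()["data"]["repository"]["discussionCategories"]["nodes"]
        for node in nodes:
            if node["name"] == "Q&A":
                return node["id"]
        raise RuntimeError(
            f"No Q&A category found; available categories: {[n['name'] for n in nodes]}"
        )


    if __name__ == "__main__":
        print(lookup_questions_category_id())

Once the real ID is dropped into QUESTIONS_CATEGORY_ID, the get_experts call that the second patch comments out can be re-enabled. For a local dry run of app/main.py itself, the pydantic-settings Settings class should pick up INPUT_TOKEN (the `token` input from action.yml, as exposed by the Docker action runtime) and GITHUB_REPOSITORY from the environment, so exporting those two variables before running the script should be enough.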