From 162cb5eafcb4d82d28f8ef301ddea3434bae5cc1 Mon Sep 17 00:00:00 2001
From: Samuel Aktar Laskar
Date: Mon, 1 Apr 2024 14:57:27 +0530
Subject: [PATCH] Updated the scraper to find issues linked to a PR

---
 scraper/src/github.py              |  17 +++--
 scraper/src/linked_issue_parser.py | 113 +++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 scraper/src/linked_issue_parser.py

diff --git a/scraper/src/github.py b/scraper/src/github.py
index 20857837..94f433d0 100755
--- a/scraper/src/github.py
+++ b/scraper/src/github.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 from urllib.parse import parse_qsl, urlparse
 from zoneinfo import ZoneInfo
-
+from linked_issue_parser import LinkedIssueParser
 import requests
 
 logging.basicConfig(
@@ -124,8 +124,6 @@ def parse_event(self, event, event_time):
             )
 
         elif event["type"] == "PullRequestEvent":
-            pr_body = event["payload"]["pull_request"]["body"]
-            no_of_linked_issues = self.parse_linked_issues(pr_body)
             if event["payload"]["action"] == "opened":
                 self.append(
                     user,
@@ -134,8 +132,7 @@ def parse_event(self, event, event_time):
                         "title": f'{event["repo"]["name"]}#{event["payload"]["pull_request"]["number"]}',
                         "time": event_time,
                         "link": event["payload"]["pull_request"]["html_url"],
-                        "text": event["payload"]["pull_request"]["title"],
-                        "no_of_linked_issues" : no_of_linked_issues
+                        "text": event["payload"]["pull_request"]["title"]
                     },
                 )
 
@@ -144,6 +141,15 @@ def parse_event(self, event, event_time):
                 and event["payload"]["pull_request"]["merged"]
             ):
                 turnaround_time = self.caclculate_turnaround_time(event)
+                pr_body = event["payload"]["pull_request"]["body"]
+                repo = event["repo"]["name"]
+                parts = repo.split('/')
+                org_name = parts[0]
+                repo_name = parts[1]
+                pr_no = event['payload']['pull_request']['number']
+                linked_issue_parser = LinkedIssueParser(org=org_name, repo=repo_name, pr_no=pr_no, pr_body=pr_body)
+                linked_issues = linked_issue_parser.parse_linked_issues()
+                self.log.debug(f'linked_issues for pr {pr_no} are {linked_issues}')
                 self.append(
                     event["payload"]["pull_request"]["user"]["login"],
                     {
@@ -153,6 +159,7 @@ def parse_event(self, event, event_time):
                         "link": event["payload"]["pull_request"]["html_url"],
                         "text": event["payload"]["pull_request"]["title"],
                         "turnaround_time": turnaround_time,
+                        "linked_issues": linked_issues
                     },
                 )
 
diff --git a/scraper/src/linked_issue_parser.py b/scraper/src/linked_issue_parser.py
new file mode 100644
index 00000000..56b34036
--- /dev/null
+++ b/scraper/src/linked_issue_parser.py
@@ -0,0 +1,113 @@
+import re
+from os import getenv
+import requests
+
+class LinkedIssueParser:
+    def __init__(self, org: str, repo: str, pr_no: int, pr_body: str):
+        self.repo = repo
+        self.pr_no = pr_no
+        self.pr_body = pr_body
+        self.org = org
+
+    # The logic for UI-linked issues is as follows:
+    # Get a list of all events on a Pull Request of the types CONNECTED_EVENT and DISCONNECTED_EVENT.
+    # Create a map, keyed by issue number, counting how many times the issue is CONNECTED and DISCONNECTED.
+    # From that map, keep the keys with an odd count, as these are the issues that were CONNECTED without a corresponding DISCONNECTED event.
+
+    def parse_ui_linked_issues(self):
+        query = """
+        {{
+            resource(url: "https://github.com/{org}/{repo}/pull/{pr_no}") {{
+                ... on PullRequest {{
+                    timelineItems(itemTypes: [CONNECTED_EVENT, DISCONNECTED_EVENT], first: 100) {{
+                        nodes {{
+                            ... on ConnectedEvent {{
+                                id
+                                subject {{
+                                    ... on Issue {{
+                                        number
+                                    }}
+                                }}
+                            }}
+                            ... on DisconnectedEvent {{
+                                id
+                                subject {{
+                                    ... on Issue {{
+                                        number
+                                    }}
+                                }}
+                            }}
+                        }}
+                    }}
+                }}
+            }}
+        }}
+        """.format(org=self.org, repo=self.repo, pr_no=self.pr_no)
+        gh_url = 'https://api.github.com/graphql'
+        token = getenv('GITHUB_TOKEN')
+        headers = {
+            'Authorization': f'Bearer {token}',
+            'Content-Type': 'application/json'
+        }
+        response = requests.post(gh_url, headers=headers, json={'query': query})
+        if response.status_code != 200:
+            return []
+        data = response.json()
+        issues = {}
+        for node in data['data']['resource']['timelineItems']['nodes']:
+            issue_number = node['subject']['number']
+            if issue_number in issues:
+                issues[issue_number] += 1
+            else:
+                issues[issue_number] = 1
+
+        linked_issues = []
+        for issue, count in issues.items():
+            if count % 2 != 0:
+                linked_issues.append(f'https://github.com/{self.org}/{self.repo}/issues/{issue}')
+        return linked_issues
+
+    def get_concat_commit_messages(self):
+        commit_url = f'https://api.github.com/repos/{self.org}/{self.repo}/pulls/{self.pr_no}/commits'
+        response = requests.get(commit_url)
+        if response.status_code != 200:
+            return ""
+        json_data = response.json()
+        result = ""
+        for commit in json_data:
+            message = commit['commit']['message']
+            result = f'{result} , {message}'
+        return result
+
+    def parse_desc_linked_issues(self):
+        pattern_same_repo = r'\b(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)\s+#(\d+)'
+        pattern_other_repo = r'\b(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)\s+(\S+\/\S+)#(\d+)'
+        commit_messages = self.get_concat_commit_messages()
+        text = f'{self.pr_body} {commit_messages}'
+        same_repo_linked_issues = re.findall(pattern_same_repo, text, re.IGNORECASE)
+        other_repo_linked_issues = re.findall(pattern_other_repo, text, re.IGNORECASE)
+        linked_issues = set()
+        for issue in same_repo_linked_issues:
+            linked_issues.add(issue)
+        for issue in other_repo_linked_issues:
+            linked_issues.add(issue)
+        linked_issues_url = []
+        for issue in linked_issues:
+            if isinstance(issue, str):
+                linked_issues_url.append(f'https://github.com/{self.org}/{self.repo}/issues/{issue}')
+            elif isinstance(issue, tuple):
+                linked_issues_url.append(f'https://github.com/{issue[0]}/issues/{issue[1]}')
+                continue
+        return linked_issues_url
+
+
+    def parse_linked_issues(self):
+        linked_issues = []
+        ui_linked_issues = self.parse_ui_linked_issues()
+        desc_linked_issues = self.parse_desc_linked_issues()
+        for issue in ui_linked_issues:
+            linked_issues.append(issue)
+        for issue in desc_linked_issues:
+            linked_issues.append(issue)
+        return linked_issues
+
\ No newline at end of file
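
A quick way to sanity-check the new parser locally (not part of the patch; the org, repo, and PR number below are placeholders, and GITHUB_TOKEN must be exported for the GraphQL lookup to succeed):

    from linked_issue_parser import LinkedIssueParser

    # Placeholder values for illustration only; substitute a real org/repo/PR.
    parser = LinkedIssueParser(
        org='example-org',
        repo='example-repo',
        pr_no=42,
        pr_body='Fixes #17 and closes other-org/other-repo#3',
    )

    # Combines timeline (UI) links with fixes/closes/resolves references found in
    # the PR body and commit messages, and returns a list of issue URLs.
    print(parser.parse_linked_issues())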