Updated the scraper to find issues linked to a PR
Samuel-Aktar-Laskar committed Apr 1, 2024
1 parent e612027 commit 162cb5e
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 5 deletions.
17 changes: 12 additions & 5 deletions scraper/src/github.py
@@ -9,7 +9,7 @@
from pathlib import Path
from urllib.parse import parse_qsl, urlparse
from zoneinfo import ZoneInfo

from linked_issue_parser import LinkedIssueParser
import requests

logging.basicConfig(
@@ -124,8 +124,6 @@ def parse_event(self, event, event_time):
)

elif event["type"] == "PullRequestEvent":
pr_body = event["payload"]["pull_request"]["body"]
no_of_linked_issues = self.parse_linked_issues(pr_body)
if event["payload"]["action"] == "opened":
self.append(
user,
@@ -134,8 +132,7 @@
"title": f'{event["repo"]["name"]}#{event["payload"]["pull_request"]["number"]}',
"time": event_time,
"link": event["payload"]["pull_request"]["html_url"],
"text": event["payload"]["pull_request"]["title"],
"no_of_linked_issues" : no_of_linked_issues
"text": event["payload"]["pull_request"]["title"]
},
)

@@ -144,6 +141,15 @@
and event["payload"]["pull_request"]["merged"]
):
turnaround_time = self.caclculate_turnaround_time(event)
pr_body = event["payload"]["pull_request"]["body"]
repo = event["repo"]["name"]
parts = repo.split('/')
org_name = parts[0]
repo_name = parts[1]
pr_no = event['payload']['pull_request']['number']
linked_issue_parser = LinkedIssueParser(org=org_name,repo=repo_name,pr_no=pr_no,pr_body=pr_body)
linked_issues = linked_issue_parser.parse_linked_issues()
self.log.debug(f'linked_issues for pr {pr_no} are {linked_issues}')
self.append(
event["payload"]["pull_request"]["user"]["login"],
{
@@ -153,6 +159,7 @@
"link": event["payload"]["pull_request"]["html_url"],
"text": event["payload"]["pull_request"]["title"],
"turnaround_time": turnaround_time,
"linked_issues" : linked_issues
},
)

113 changes: 113 additions & 0 deletions scraper/src/linked_issue_parser.py
@@ -0,0 +1,113 @@
import re
from os import getenv
import requests

class LinkedIssueParser:
def __init__(self,org:str, repo:str, pr_no:int, pr_body: str):
self.repo = repo
self.pr_no = pr_no
self.pr_body = pr_body
self.org = org

# The logic here is as follows:
# Get a list of all events on a Pull Request of the type CONNECTED_EVENT and DISCONNECTED_EVENT
# Create a map, keyed by issue number, and count how many times the issue is CONNECTED and DISCONNECTED.
# From that map, keep the keys with an odd count: those issues were CONNECTED without a matching DISCONNECTED event, so they are still linked.
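# For example (hypothetical history): if issue #7 is connected, disconnected, and connected again, its count is 3 (odd), so it is still linked;
# an issue that was connected once and later disconnected has a count of 2 (even) and is dropped.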

def parse_ui_linked_issues(self):
query = """
{{
resource(url: "https://github.com/{org}/{repo}/pull/{pr_no}") {{
... on PullRequest {{
timelineItems(itemTypes: [CONNECTED_EVENT, DISCONNECTED_EVENT], first: 100) {{
nodes {{
... on ConnectedEvent {{
id
subject {{
... on Issue {{
number
}}
}}
}}
... on DisconnectedEvent {{
id
subject {{
... on Issue {{
number
}}
}}
}}
}}
}}
}}
}}
}}
""".format(org = self.org, repo = self.repo, pr_no = self.pr_no)
gh_url = 'https://api.github.com/graphql'
token = getenv('GITHUB_TOKEN')
headers = {
'Authorization': f'Bearer {token}',
'Content-Type':'application/json'
}
response = requests.post(gh_url,headers=headers,json={'query':query})
if response.status_code != 200:
return []
data = response.json()
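# Assumed (simplified) response shape for the query above, with an issue number under each timeline node:
# {"data": {"resource": {"timelineItems": {"nodes": [{"id": "...", "subject": {"number": 42}}, ...]}}}}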
issues = {}
for node in data['data']['resource']['timelineItems']['nodes']:
issue_number = node['subject']['number']
if issue_number in issues:
issues[issue_number] +=1
else:
issues[issue_number] = 1

linked_issues = []
for issue, count in issues.items():
if count % 2 != 0:
linked_issues.append(f'https://github.com/{self.org}/{self.repo}/issues/{issue}')
return linked_issues

def get_concat_commit_messages(self):
commit_url = f'https://api.github.com/repos/{self.org}/{self.repo}/pulls/{self.pr_no}/commits'
response = requests.get(commit_url)
if response.status_code != 200:
return ""
json_data = response.json()
result = ""
for commit in json_data:
message = commit['commit']['message']
result = f'{result} , {message}'
return result

def parse_desc_linked_issues(self):
pattern_same_repo = r'\b(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)\s+#(\d+)'
pattern_other_repo = r'\b(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)\s+(\S+\/\S+)#(\d+)'
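# For example (hypothetical PR text): "Fixes #12" is captured by pattern_same_repo as '12',
# while "closes some-org/some-repo#7" is captured by pattern_other_repo as ('some-org/some-repo', '7').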
commit_messages = self.get_concat_commit_messages()
text = f'{self.pr_body} {commit_messages}'
same_repo_linked_issues = re.findall(pattern_same_repo,text,re.IGNORECASE)
other_repo_linked_issues = re.findall(pattern_other_repo,text,re.IGNORECASE)
linked_issues = set([])
for issue in same_repo_linked_issues:
linked_issues.add(issue)
for issue in other_repo_linked_issues:
linked_issues.add(issue)
linked_issues_url = []
for issue in linked_issues:
if isinstance(issue, str):
linked_issues_url.append(f'https://github.com/{self.org}/{self.repo}/issues/{issue}')
elif isinstance(issue, tuple):
linked_issues_url.append(f'https://github.com/{issue[0]}/issues/{issue[1]}')
return linked_issues_url


def parse_linked_issues(self):
linked_issues = []
ui_linked_issues = self.parse_ui_linked_issues()
desc_linked_issues = self.parse_desc_linked_issues()
for issue in ui_linked_issues:
linked_issues.append(issue)
for issue in desc_linked_issues:
linked_issues.append(issue)
return linked_issues
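
For reference, a minimal usage sketch of the new parser. The org, repo, PR number, and body below are hypothetical, and GITHUB_TOKEN must be set in the environment for the GraphQL lookup in parse_ui_linked_issues to return results:

from linked_issue_parser import LinkedIssueParser

# Hypothetical pull request whose body closes an issue in the same repository.
parser = LinkedIssueParser(org="example-org", repo="example-repo", pr_no=42, pr_body="Fixes #7")

# Combines issues linked via the GitHub UI (GraphQL timeline events) with issues
# referenced by closing keywords in the PR body and its commit messages.
print(parser.parse_linked_issues())
# e.g. ['https://github.com/example-org/example-repo/issues/7']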
