From 4eef3e91905030ea843a64d9b54ba09980eb1f58 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Thu, 10 Oct 2024 08:48:37 +0300 Subject: [PATCH 1/3] feat: add ticket compliance check and rate limit validation - Implement ticket compliance check logic in `utils.py` and `ticket_pr_compliance_check.py` - Add functions to extract and cache PR tickets, and check ticket relevancy - Introduce rate limit validation for GitHub API requests - Update `pr_reviewer_prompts.toml` and `pr_description_prompts.toml` to include ticket compliance fields - Modify configuration to require ticket analysis review --- pr_agent/algo/utils.py | 190 +++++++++++++----- pr_agent/settings/configuration.toml | 4 +- pr_agent/settings/pr_description_prompts.toml | 25 ++- pr_agent/settings/pr_reviewer_prompts.toml | 54 ++++- pr_agent/tools/ticket_pr_compliance_check.py | 113 +++++++++++ 5 files changed, 326 insertions(+), 60 deletions(-) create mode 100644 pr_agent/tools/ticket_pr_compliance_check.py diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py index bd0781ae7..98c69d007 100644 --- a/pr_agent/algo/utils.py +++ b/pr_agent/algo/utils.py @@ -1,18 +1,22 @@ from __future__ import annotations -import html2text -import html import copy import difflib +import hashlib +import html import json import os import re import textwrap import time +import traceback from datetime import datetime from enum import Enum from typing import Any, List, Tuple + +import html2text +import requests import yaml from pydantic import BaseModel from starlette_context import context @@ -110,6 +114,7 @@ def convert_to_markdown_v2(output_data: dict, "Insights from user's answers": "📝", "Code feedback": "🤖", "Estimated effort to review [1-5]": "⏱️", + "Ticket compliance check": "🎫", } markdown_text = "" if not incremental_review: @@ -165,6 +170,8 @@ def convert_to_markdown_v2(output_data: dict, markdown_text += f'### {emoji} No relevant tests\n\n' else: markdown_text += f"### PR contains tests\n\n" + elif 'ticket compliance check' in key_nice.lower(): + markdown_text = ticket_markdown_logic(emoji, markdown_text, value, gfm_supported) elif 'security concerns' in key_nice.lower(): if gfm_supported: markdown_text += f"" @@ -254,6 +261,52 @@ def convert_to_markdown_v2(output_data: dict, return markdown_text +def ticket_markdown_logic(emoji, markdown_text, value, gfm_supported) -> str: + ticket_compliance_str = "" + final_compliance_level = -1 + if isinstance(value, list): + for v in value: + ticket_url = v.get('ticket_url', '').strip() + compliance_level = v.get('overall_compliance_level', '').strip() + # add emojis, if 'Fully compliant' ✅, 'Partially compliant' 🔶, or 'Not compliant' ❌ + if compliance_level.lower() == 'fully compliant': + # compliance_level = '✅ Fully compliant' + final_compliance_level = 2 if final_compliance_level == -1 else 1 + elif compliance_level.lower() == 'partially compliant': + # compliance_level = '🔶 Partially compliant' + final_compliance_level = 1 + elif compliance_level.lower() == 'not compliant': + # compliance_level = '❌ Not compliant' + final_compliance_level = 0 if final_compliance_level < 1 else 1 + + # explanation = v.get('compliance_analysis', '').strip() + explanation = '' + fully_compliant_str = v.get('fully_compliant_requirements', '').strip() + not_compliant_str = v.get('not_compliant_requirements', '').strip() + if fully_compliant_str: + explanation += f"Fully compliant requirements:\n{fully_compliant_str}\n\n" + if not_compliant_str: + explanation += f"Not compliant requirements:\n{not_compliant_str}\n\n" + + 
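+            # one markdown entry per ticket: a linked ticket id followed by the
+            # per-requirement compliance breakdown assembled above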
ticket_compliance_str += f"\n\n**[{ticket_url.split('/')[-1]}]({ticket_url}) - {compliance_level}**\n\n{explanation}\n\n" + if final_compliance_level == 2: + compliance_level = '✅' + elif final_compliance_level == 1: + compliance_level = '🔶' + else: + compliance_level = '❌' + + if gfm_supported: + markdown_text += f"\n\n" + markdown_text += f"**{emoji} Ticket compliance analysis {compliance_level}**\n\n" + markdown_text += ticket_compliance_str + markdown_text += f"\n" + else: + markdown_text += f"### {emoji} Ticket compliance analysis {compliance_level}\n\n" + markdown_text += ticket_compliance_str+"\n\n" + return markdown_text + + def process_can_be_split(emoji, value): try: # key_nice = "Can this PR be split?" @@ -554,7 +607,8 @@ def load_yaml(response_text: str, keys_fix_yaml: List[str] = [], first_key="", l get_logger().warning(f"Initial failure to parse AI prediction: {e}") data = try_fix_yaml(response_text, keys_fix_yaml=keys_fix_yaml, first_key=first_key, last_key=last_key) if not data: - get_logger().error(f"Failed to parse AI prediction after fallbacks", artifact={'response_text': response_text}) + get_logger().error(f"Failed to parse AI prediction after fallbacks", + artifact={'response_text': response_text}) else: get_logger().info(f"Successfully parsed AI prediction after fallbacks", artifact={'response_text': response_text}) @@ -841,56 +895,64 @@ def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo], break return position, absolute_position -def validate_and_await_rate_limit(rate_limit_status=None, git_provider=None, get_rate_limit_status_func=None): - if git_provider and not rate_limit_status: - rate_limit_status = {'resources': git_provider.github_client.get_rate_limit().raw_data} +def get_rate_limit_status(github_token) -> dict: + GITHUB_API_URL = get_settings(use_context=False).get("GITHUB.BASE_URL", "https://api.github.com").rstrip("/") # "https://api.github.com" + # GITHUB_API_URL = "https://api.github.com" + RATE_LIMIT_URL = f"{GITHUB_API_URL}/rate_limit" + HEADERS = { + "Accept": "application/vnd.github.v3+json", + "Authorization": f"token {github_token}" + } - if not rate_limit_status: - rate_limit_status = get_rate_limit_status_func() + response = requests.get(RATE_LIMIT_URL, headers=HEADERS) + try: + rate_limit_info = response.json() + if rate_limit_info.get('message') == 'Rate limiting is not enabled.': # for github enterprise + return {'resources': {}} + response.raise_for_status() # Check for HTTP errors + except: # retry + time.sleep(0.1) + response = requests.get(RATE_LIMIT_URL, headers=HEADERS) + return response.json() + return rate_limit_info + + +def validate_rate_limit_github(github_token, installation_id=None, threshold=0.1) -> bool: + try: + rate_limit_status = get_rate_limit_status(github_token) + if installation_id: + get_logger().debug(f"installation_id: {installation_id}, Rate limit status: {rate_limit_status['rate']}") # validate that the rate limit is not exceeded - is_rate_limit = False - for key, value in rate_limit_status['resources'].items(): - if value['remaining'] == 0: - print(f"key: {key}, value: {value}") - is_rate_limit = True - sleep_time_sec = value['reset'] - datetime.now().timestamp() - sleep_time_hour = sleep_time_sec / 3600.0 - print(f"Rate limit exceeded. 
Sleeping for {sleep_time_hour} hours") - if sleep_time_sec > 0: - time.sleep(sleep_time_sec+1) - - if git_provider: - rate_limit_status = {'resources': git_provider.github_client.get_rate_limit().raw_data} - else: - rate_limit_status = get_rate_limit_status_func() - - return is_rate_limit - - -def get_largest_component(pr_url): - from pr_agent.tools.pr_analyzer import PRAnalyzer - publish_output = get_settings().config.publish_output - get_settings().config.publish_output = False # disable publish output - analyzer = PRAnalyzer(pr_url) - methods_dict_files = analyzer.run_sync() - get_settings().config.publish_output = publish_output - max_lines_changed = 0 - file_b = "" - component_name_b = "" - for file in methods_dict_files: - for method in methods_dict_files[file]: - try: - if methods_dict_files[file][method]['num_plus_lines'] > max_lines_changed: - max_lines_changed = methods_dict_files[file][method]['num_plus_lines'] - file_b = file - component_name_b = method - except: - pass - if component_name_b: - get_logger().info(f"Using the largest changed component: '{component_name_b}'") - return component_name_b, file_b - else: - return None, None + # validate that the rate limit is not exceeded + for key, value in rate_limit_status['resources'].items(): + if value['remaining'] < value['limit'] * threshold: + get_logger().error(f"key: {key}, value: {value}") + return False + return True + except Exception as e: + get_logger().error(f"Error in rate limit {e}", + artifact={"traceback": traceback.format_exc()}) + return True + + +def validate_and_await_rate_limit(github_token): + try: + rate_limit_status = get_rate_limit_status(github_token) + # validate that the rate limit is not exceeded + for key, value in rate_limit_status['resources'].items(): + if value['remaining'] < value['limit'] // 80: + get_logger().error(f"key: {key}, value: {value}") + sleep_time_sec = value['reset'] - datetime.now().timestamp() + sleep_time_hour = sleep_time_sec / 3600.0 + get_logger().error(f"Rate limit exceeded. Sleeping for {sleep_time_hour} hours") + if sleep_time_sec > 0: + time.sleep(sleep_time_sec + 1) + rate_limit_status = get_rate_limit_status(github_token) + return rate_limit_status + except: + get_logger().error("Error in rate limit") + return None + def github_action_output(output_data: dict, key_name: str): try: @@ -906,7 +968,7 @@ def github_action_output(output_data: dict, key_name: str): def show_relevant_configurations(relevant_section: str) -> str: - skip_keys = ['ai_disclaimer', 'ai_disclaimer_title', 'ANALYTICS_FOLDER', 'secret_provider', "skip_keys", + skip_keys = ['ai_disclaimer', 'ai_disclaimer_title', 'ANALYTICS_FOLDER', 'secret_provider', "skip_keys", "app_id", "redirect", 'trial_prefix_message', 'no_eligible_message', 'identity_provider', 'ALLOWED_REPOS','APP_NAME'] extra_skip_keys = get_settings().config.get('config.skip_keys', []) if extra_skip_keys: @@ -939,6 +1001,25 @@ def is_value_no(value): return False +def set_pr_string(repo_name, pr_number): + return f"{repo_name}#{pr_number}" + + +def string_to_uniform_number(s: str) -> float: + """ + Convert a string to a uniform number in the range [0, 1]. + The uniform distribution is achieved by the nature of the SHA-256 hash function, which produces a uniformly distributed hash value over its output space. 
+ """ + # Generate a hash of the string + hash_object = hashlib.sha256(s.encode()) + # Convert the hash to an integer + hash_int = int(hash_object.hexdigest(), 16) + # Normalize the integer to the range [0, 1] + max_hash_int = 2 ** 256 - 1 + uniform_number = float(hash_int) / max_hash_int + return uniform_number + + def process_description(description_full: str) -> Tuple[str, List]: if not description_full: return "", [] @@ -998,7 +1079,10 @@ def process_description(description_full: str) -> Tuple[str, List]: 'long_summary': long_summary }) else: - get_logger().error(f"Failed to parse description", artifact={'description': file_data}) + if '...' in file_data: + pass # PR with many files. some did not get analyzed + else: + get_logger().error(f"Failed to parse description", artifact={'description': file_data}) except Exception as e: get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data}) diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index add06bb6b..d4ecf3794 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -51,9 +51,7 @@ require_tests_review=true require_estimate_effort_to_review=true require_can_be_split_review=false require_security_review=true -# soc2 -require_soc2_ticket=false -soc2_ticket_prompt="Does the PR description include a link to ticket in a project management system (e.g., Jira, Asana, Trello, etc.) ?" +require_ticket_analysis_review=true # general options num_code_suggestions=0 inline_code_comments = false diff --git a/pr_agent/settings/pr_description_prompts.toml b/pr_agent/settings/pr_description_prompts.toml index de7c3d548..364dd9af7 100644 --- a/pr_agent/settings/pr_description_prompts.toml +++ b/pr_agent/settings/pr_description_prompts.toml @@ -78,9 +78,9 @@ pr_files: ... ... {%- endif %} -description: |- +description: | ... -title: |- +title: | ... {%- if enable_custom_labels %} labels: @@ -94,7 +94,26 @@ labels: Answer should be a valid YAML, and nothing else. Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ -user="""PR Info: +user=""" +{%- if related_tickets %} +Related Ticket Info: +{% for ticket in related_tickets %} +===== +Ticket Title: '{{ ticket.title }}' +{%- if ticket.labels %} +Ticket Labels: {{ ticket.labels }} +{%- endif %} +{%- if ticket.body %} +Ticket Description: +##### +{{ ticket.body }} +##### +{%- endif %} +===== +{% endfor %} +{%- endif %} + +PR Info: Previous title: '{{title}}' diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml index e3b4bfe48..0b61e8ea5 100644 --- a/pr_agent/settings/pr_reviewer_prompts.toml +++ b/pr_agent/settings/pr_reviewer_prompts.toml @@ -85,7 +85,20 @@ class KeyIssuesComponentLink(BaseModel): start_line: int = Field(description="The start line that corresponds to this issue in the relevant file") end_line: int = Field(description="The end line that corresponds to this issue in the relevant file") +{%- if related_tickets %} + +class TicketCompliance(BaseModel): + ticket_url: str = Field(description="Ticket URL or ID") + ticket_requirements: str = Field(description="Repeat, in your own words, all ticket requirements, in bullet points") + fully_compliant_requirements: str = Field(description="A list, in bullet points, of which requirements are met by the PR code. Don't explain how the requirements are met, just list them shortly. 
Can be empty") + not_compliant_requirements: str = Field(description="A list, in bullet points, of which requirements are not met by the PR code. Don't explain how the requirements are not met, just list them shortly. Can be empty") + overall_compliance_level: str = Field(description="Overall give this PR one of these three values in relation to the ticket: 'Fully compliant', 'Partially compliant', or 'Not compliant'") +{%- endif %} + class Review(BaseModel): +{%- if related_tickets %} + ticket_compliance_check: List[TicketCompliance] = Field(description="A list of compliance checks for the related tickets") +{%- endif %} {%- if require_estimate_effort_to_review %} estimated_effort_to_review_[1-5]: int = Field(description="Estimate, on a scale of 1-5 (inclusive), the time and effort required to review this PR by an experienced and knowledgeable developer. 1 means short and easy review , 5 means long and hard review. Take into account the size, complexity, quality, and the needed changes of the PR code diff.") {%- endif %} @@ -130,6 +143,19 @@ class PRReview(BaseModel): Example output: ```yaml review: +{%- if related_tickets %} + ticket_compliance_check: + - ticket_url: | + ... + ticket_requirements: | + ... + fully_compliant_requirements: | + ... + not_compliant_requirements: | + ... + overall_compliance_level: | + ... +{%- endif %} {%- if require_estimate_effort_to_review %} estimated_effort_to_review_[1-5]: | 3 @@ -176,7 +202,33 @@ code_feedback: Answer should be a valid YAML, and nothing else. Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ -user="""--PR Info-- +user=""" +{%- if related_tickets %} +--PR Ticket Info-- +{%- for ticket in related_tickets %} +===== +Ticket URL: '{{ ticket.ticket_url }}' + +Ticket Title: '{{ ticket.title }}' + +{%- if ticket.labels %} + +Ticket Labels: {{ ticket.labels }} + +{%- endif %} +{%- if ticket.body %} + +Ticket Description: +##### +{{ ticket.body }} +##### +{%- endif %} +===== +{% endfor %} +{%- endif %} + + +--PR Info-- Title: '{{title}}' diff --git a/pr_agent/tools/ticket_pr_compliance_check.py b/pr_agent/tools/ticket_pr_compliance_check.py new file mode 100644 index 000000000..03fdc88b5 --- /dev/null +++ b/pr_agent/tools/ticket_pr_compliance_check.py @@ -0,0 +1,113 @@ +import re +import traceback + +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import GithubProvider +from pr_agent.log import get_logger + + +def find_jira_tickets(text): + # Regular expression patterns for JIRA tickets + patterns = [ + r'\b[A-Z]{2,10}-\d{1,7}\b', # Standard JIRA ticket format (e.g., PROJ-123) + r'(?:https?://[^\s/]+/browse/)?([A-Z]{2,10}-\d{1,7})\b' # JIRA URL or just the ticket + ] + + tickets = set() + for pattern in patterns: + matches = re.findall(pattern, text) + for match in matches: + if isinstance(match, tuple): + # If it's a tuple (from the URL pattern), take the last non-empty group + ticket = next((m for m in reversed(match) if m), None) + else: + ticket = match + if ticket: + tickets.add(ticket) + + return list(tickets) + + +def extract_ticket_links_from_pr_description(pr_description, repo_path): + """ + Extract all ticket links from PR description + """ + + # example link to search for: https://github.com/Codium-ai/pr-agent-pro/issues/525 + pattern = r'https://github[^/]+/[^/]+/[^/]+/issues/\d+' # should support also github server (for example 'https://github.company.ai/Codium-ai/pr-agent-pro/issues/525') + + # Find all matches in the text + github_tickets = 
re.findall(pattern, pr_description) + + # Find all issues referenced like #123 and add them as https://github.com/{repo_path}/issues/{issue_number} + # (unneeded, since when you pull the actual comment, it appears as a full link) + # issue_number_pattern = r'#\d+' + # issue_numbers = re.findall(issue_number_pattern, pr_description) + # for issue_number in issue_numbers: + # issue_number = issue_number[1:] # remove # + # # check if issue_number is a valid number and len(issue_number) < 5 + # if issue_number.isdigit() and len(issue_number) < 5: + # github_tickets.append(f'https://github.com/{repo_path}/issues/{issue_number}') + + return github_tickets + + +async def extract_tickets(git_provider): + MAX_TICKET_CHARACTERS = 10000 + try: + if isinstance(git_provider, GithubProvider): + user_description = git_provider.get_user_description() + tickets = extract_ticket_links_from_pr_description(user_description, git_provider.repo) + tickets_content = [] + if tickets: + for ticket in tickets: + # extract ticket number and repo name + repo_name, original_issue_number = git_provider._parse_issue_url(ticket) + + # get the ticket object + issue_main = git_provider.repo_obj.get_issue(original_issue_number) + + # clip issue_main.body max length + issue_body = issue_main.body + if len(issue_main.body) > MAX_TICKET_CHARACTERS: + issue_body = issue_main.body[:MAX_TICKET_CHARACTERS] + "..." + + # extract labels + labels = [] + try: + for label in issue_main.labels: + if isinstance(label, str): + labels.append(label) + else: + labels.append(label.name) + except Exception as e: + get_logger().error(f"Error extracting labels error= {e}", + artifact={"traceback": traceback.format_exc()}) + tickets_content.append( + {'ticket_id': issue_main.number, + 'ticket_url': ticket, 'title': issue_main.title, 'body': issue_body, + 'labels': ", ".join(labels)}) + return tickets_content + + except Exception as e: + get_logger().error(f"Error extracting tickets error= {e}", + artifact={"traceback": traceback.format_exc()}) + + +async def extract_and_cache_pr_tickets(git_provider, vars): + if get_settings().get('config.require_ticket_analysis_review', False): + return + related_tickets = get_settings().get('related_tickets', []) + if not related_tickets: + tickets_content = await extract_tickets(git_provider) + if tickets_content: + get_logger().info("Extracted tickets from PR description", artifact={"tickets": tickets_content}) + vars['related_tickets'] = tickets_content + get_settings().set('related_tickets', tickets_content) + else: # if tickets are already cached + get_logger().info("Using cached tickets", artifact={"tickets": related_tickets}) + vars['related_tickets'] = related_tickets + + +def check_tickets_relevancy(): + return True From 7db9a038055e37f10aadfaa4cd53a276a9b4b4a2 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Thu, 10 Oct 2024 08:53:07 +0300 Subject: [PATCH 2/3] feat: integrate ticket extraction and enhance YAML handling in PR tools - Add ticket extraction and caching functionality in `pr_description.py` and `pr_reviewer.py`. - Introduce `keys_fix` parameter to improve YAML loading robustness. - Enhance error handling for estimated effort parsing in `pr_reviewer.py`. 
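
A minimal standalone sketch of the hardened effort parsing (the sample value below is
invented for illustration; the branch mirrors the new logic in `set_review_labels`):

```python
# Toy reproduction of the tolerant parsing added in set_review_labels:
# the model may return either an int or a string such as "3, because ...".
estimated_effort = "3, because the PR touches several modules"  # invented sample
estimated_effort_number = 0
if isinstance(estimated_effort, str):
    try:
        estimated_effort_number = int(estimated_effort.split(',')[0])
    except ValueError:
        pass  # leave 0, which falls outside the 1-5 labeling range
elif isinstance(estimated_effort, int):
    estimated_effort_number = estimated_effort
print(estimated_effort_number)  # -> 3
```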
--- pr_agent/tools/pr_description.py | 21 ++++++++++++++------- pr_agent/tools/pr_reviewer.py | 21 ++++++++++++++++++--- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py index 9f7d79d31..c965d84e7 100644 --- a/pr_agent/tools/pr_description.py +++ b/pr_agent/tools/pr_description.py @@ -20,6 +20,8 @@ from pr_agent.git_providers.git_provider import get_main_pr_language from pr_agent.log import get_logger from pr_agent.servers.help import HelpMessage +from pr_agent.tools.ticket_pr_compliance_check import extract_ticket_links_from_pr_description, extract_tickets, \ + extract_and_cache_pr_tickets class PRDescription: @@ -38,6 +40,7 @@ def __init__(self, pr_url: str, args: list = None, self.git_provider.get_languages(), self.git_provider.get_files() ) self.pr_id = self.git_provider.get_pr_id() + self.keys_fix = ["filename:", "language:", "changes_summary:", "changes_title:", "description:", "title:"] if get_settings().pr_description.enable_semantic_files_types and not self.git_provider.is_supported( "gfm_markdown"): @@ -60,6 +63,7 @@ def __init__(self, pr_url: str, args: list = None, "enable_custom_labels": get_settings().config.enable_custom_labels, "custom_labels_class": "", # will be filled if necessary in 'set_custom_labels' function "enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types, + "related_tickets": "", } self.user_description = self.git_provider.get_user_description() @@ -87,6 +91,9 @@ async def run(self): if get_settings().config.publish_output and not get_settings().config.get('is_auto_command', False): self.git_provider.publish_comment("Preparing PR description...", is_temporary=True) + # ticket extraction if exists + await extract_and_cache_pr_tickets(self.git_provider, self.vars) + await retry_with_fallback_models(self._prepare_prediction, ModelType.TURBO) if self.prediction: @@ -226,7 +233,7 @@ async def _prepare_prediction(self, model: str) -> None: file_description_str_list = [] for i, result in enumerate(results): prediction_files = result.strip().removeprefix('```yaml').strip('`').strip() - if load_yaml(prediction_files) and prediction_files.startswith('pr_files'): + if load_yaml(prediction_files, keys_fix_yaml=self.keys_fix) and prediction_files.startswith('pr_files'): prediction_files = prediction_files.removeprefix('pr_files:').strip() file_description_str_list.append(prediction_files) else: @@ -304,16 +311,16 @@ async def _prepare_prediction(self, model: str) -> None: # final processing self.prediction = prediction_headers + "\n" + "pr_files:\n" + files_walkthrough - if not load_yaml(self.prediction): + if not load_yaml(self.prediction, keys_fix_yaml=self.keys_fix): get_logger().error(f"Error getting valid YAML in large PR handling for describe {self.pr_id}") - if load_yaml(prediction_headers): + if load_yaml(prediction_headers, keys_fix_yaml=self.keys_fix): get_logger().debug(f"Using only headers for describe {self.pr_id}") self.prediction = prediction_headers async def extend_additional_files(self, remaining_files_list) -> str: prediction = self.prediction try: - original_prediction_dict = load_yaml(self.prediction) + original_prediction_dict = load_yaml(self.prediction, keys_fix_yaml=self.keys_fix) prediction_extra = "pr_files:" for file in remaining_files_list: extra_file_yaml = f"""\ @@ -327,12 +334,12 @@ async def extend_additional_files(self, remaining_files_list) -> str: additional files (token-limit) """ prediction_extra = prediction_extra + 
"\n" + extra_file_yaml.strip() - prediction_extra_dict = load_yaml(prediction_extra) + prediction_extra_dict = load_yaml(prediction_extra, keys_fix_yaml=self.keys_fix) # merge the two dictionaries if isinstance(original_prediction_dict, dict) and isinstance(prediction_extra_dict, dict): original_prediction_dict["pr_files"].extend(prediction_extra_dict["pr_files"]) new_yaml = yaml.dump(original_prediction_dict) - if load_yaml(new_yaml): + if load_yaml(new_yaml, keys_fix_yaml=self.keys_fix): prediction = new_yaml return prediction except Exception as e: @@ -361,7 +368,7 @@ async def _get_prediction(self, model: str, patches_diff: str, prompt="pr_descri def _prepare_data(self): # Load the AI prediction data into a dictionary - self.data = load_yaml(self.prediction.strip()) + self.data = load_yaml(self.prediction.strip(), keys_fix_yaml=self.keys_fix) if get_settings().pr_description.add_original_user_description and self.user_description: self.data["User Description"] = self.user_description diff --git a/pr_agent/tools/pr_reviewer.py b/pr_agent/tools/pr_reviewer.py index 88799d987..f5f82e060 100644 --- a/pr_agent/tools/pr_reviewer.py +++ b/pr_agent/tools/pr_reviewer.py @@ -1,5 +1,6 @@ import copy import datetime +import traceback from collections import OrderedDict from functools import partial from typing import List, Tuple @@ -15,6 +16,7 @@ from pr_agent.git_providers.git_provider import IncrementalPR, get_main_pr_language from pr_agent.log import get_logger from pr_agent.servers.help import HelpMessage +from pr_agent.tools.ticket_pr_compliance_check import extract_tickets, extract_and_cache_pr_tickets class PRReviewer: @@ -84,6 +86,7 @@ def __init__(self, pr_url: str, is_answer: bool = False, is_auto: bool = False, "custom_labels": "", "enable_custom_labels": get_settings().config.enable_custom_labels, "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False), + "related_tickets": get_settings().get('related_tickets', []), } self.token_handler = TokenHandler( @@ -121,6 +124,9 @@ async def run(self) -> None: 'config': dict(get_settings().config)} get_logger().debug("Relevant configs", artifacts=relevant_configs) + # ticket extraction if exists + await extract_and_cache_pr_tickets(self.git_provider, self.vars) + if self.incremental.is_incremental and hasattr(self.git_provider, "unreviewed_files_set") and not self.git_provider.unreviewed_files_set: get_logger().info(f"Incremental review is enabled for {self.pr_url} but there are no new files") previous_review_url = "" @@ -207,7 +213,7 @@ def _prepare_pr_review(self) -> str: first_key = 'review' last_key = 'security_concerns' data = load_yaml(self.prediction.strip(), - keys_fix_yaml=["estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", + keys_fix_yaml=["ticket_compliance_check", "estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", "relevant_file:", "relevant_line:", "suggestion:"], first_key=first_key, last_key=last_key) github_action_output(data, 'review') @@ -282,7 +288,7 @@ def _publish_inline_code_comments(self) -> None: first_key = 'review' last_key = 'security_concerns' data = load_yaml(self.prediction.strip(), - keys_fix_yaml=["estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", + keys_fix_yaml=["ticket_compliance_check", "estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", "relevant_file:", "relevant_line:", "suggestion:"], first_key=first_key, last_key=last_key) comments: List[str] = [] @@ -401,7 
+407,16 @@ def set_review_labels(self, data): review_labels = [] if get_settings().pr_reviewer.enable_review_labels_effort: estimated_effort = data['review']['estimated_effort_to_review_[1-5]'] - estimated_effort_number = int(estimated_effort.split(',')[0]) + estimated_effort_number = 0 + if isinstance(estimated_effort, str): + try: + estimated_effort_number = int(estimated_effort.split(',')[0]) + except ValueError: + get_logger().warning(f"Invalid estimated_effort value: {estimated_effort}") + elif isinstance(estimated_effort, int): + estimated_effort_number = estimated_effort + else: + get_logger().warning(f"Unexpected type for estimated_effort: {type(estimated_effort)}") if 1 <= estimated_effort_number <= 5: # 1, because ... review_labels.append(f'Review effort [1-5]: {estimated_effort_number}') if get_settings().pr_reviewer.enable_review_labels_security and get_settings().pr_reviewer.require_security_review: From 76d95bb6d70425c0921d0329a518604fbc57668a Mon Sep 17 00:00:00 2001 From: mrT23 Date: Thu, 10 Oct 2024 08:48:37 +0300 Subject: [PATCH 3/3] feat: add ticket compliance check - Implement ticket compliance check logic in `utils.py` and `ticket_pr_compliance_check.py` - Add functions to extract and cache PR tickets, and check ticket relevancy --- README.md | 7 +- docs/docs/index.md | 1 - docs/docs/overview/index.md | 1 - docs/docs/overview/pr_agent_pro.md | 1 - docs/docs/tools/review.md | 17 +- pr_agent/algo/utils.py | 190 +++++++++++++----- pr_agent/settings/configuration.toml | 4 +- pr_agent/settings/pr_description_prompts.toml | 25 ++- pr_agent/settings/pr_reviewer_prompts.toml | 54 ++++- pr_agent/tools/pr_description.py | 21 +- pr_agent/tools/pr_reviewer.py | 21 +- pr_agent/tools/ticket_pr_compliance_check.py | 113 +++++++++++ 12 files changed, 367 insertions(+), 88 deletions(-) create mode 100644 pr_agent/tools/ticket_pr_compliance_check.py diff --git a/README.md b/README.md index cdbc5f721..d6ab33b38 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,12 @@ CodiumAI PR-Agent aims to help efficiently review and handle pull requests, by p ## News and Updates +### October 10, 2024 +New ability for the `review` tool - **ticket compliance feedback**. If the PR contains a ticket number, PR-Agent will check if the PR code actually complies with the ticket requirements. + + + + ### September 21, 2024 Need help with PR-Agent? New feature - simply comment `/help "your question"` in a pull request, and PR-Agent will provide you with the [relevant documentation](https://github.com/Codium-ai/pr-agent/pull/1241#issuecomment-2365259334). 
@@ -93,7 +99,6 @@ Supported commands per platform: |-------|---------------------------------------------------------------------------------------------------------|:--------------------:|:--------------------:|:--------------------:|:------------:| | TOOLS | Review | ✅ | ✅ | ✅ | ✅ | | | ⮑ Incremental | ✅ | | | | -| | ⮑ [SOC2 Compliance](https://pr-agent-docs.codium.ai/tools/review/#soc2-ticket-compliance) 💎 | ✅ | ✅ | ✅ | | | | Describe | ✅ | ✅ | ✅ | ✅ | | | ⮑ [Inline File Summary](https://pr-agent-docs.codium.ai/tools/describe#inline-file-summary) 💎 | ✅ | | | | | | Improve | ✅ | ✅ | ✅ | ✅ | diff --git a/docs/docs/index.md b/docs/docs/index.md index 3bc1ed4ba..e3f7518f4 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -29,7 +29,6 @@ Qodo Merge offers extensive pull request functionalities across various git prov |-------|-----------------------------------------------------------------------------------------------------------------------|:------:|:------:|:---------:|:------------:| | TOOLS | Review | ✅ | ✅ | ✅ | ✅ | | | ⮑ Incremental | ✅ | | | | -| | ⮑ [SOC2 Compliance](https://qodo-merge-docs.qodo.ai/tools/review/#soc2-ticket-compliance){:target="_blank"} 💎 | ✅ | ✅ | ✅ | | | | Ask | ✅ | ✅ | ✅ | ✅ | | | Describe | ✅ | ✅ | ✅ | ✅ | | | ⮑ [Inline file summary](https://qodo-merge-docs.qodo.ai/tools/describe/#inline-file-summary){:target="_blank"} 💎 | ✅ | ✅ | | | diff --git a/docs/docs/overview/index.md b/docs/docs/overview/index.md index d4330a95b..f56783bc8 100644 --- a/docs/docs/overview/index.md +++ b/docs/docs/overview/index.md @@ -29,7 +29,6 @@ Qodo Merge offers extensive pull request functionalities across various git prov |-------|-----------------------------------------------------------------------------------------------------------------------|:------:|:------:|:---------:|:------------:| | TOOLS | Review | ✅ | ✅ | ✅ | ✅ | | | ⮑ Incremental | ✅ | | | | -| | ⮑ [SOC2 Compliance](https://qodo-merge-docs.qodo.ai/tools/review/#soc2-ticket-compliance){:target="_blank"} 💎 | ✅ | ✅ | ✅ | ✅ | | | Ask | ✅ | ✅ | ✅ | ✅ | | | Describe | ✅ | ✅ | ✅ | ✅ | | | ⮑ [Inline file summary](https://qodo-merge-docs.qodo.ai/tools/describe/#inline-file-summary){:target="_blank"} 💎 | ✅ | ✅ | | ✅ | diff --git a/docs/docs/overview/pr_agent_pro.md b/docs/docs/overview/pr_agent_pro.md index e6e859cc4..731b0573e 100644 --- a/docs/docs/overview/pr_agent_pro.md +++ b/docs/docs/overview/pr_agent_pro.md @@ -27,7 +27,6 @@ Here are some of the additional features and capabilities that Qodo Merge Pro of | [**Advanced usage statistics**](https://www.codium.ai/contact/#/) | Qodo Merge Pro offers detailed statistics at user, repository, and company levels, including metrics about Qodo Merge usage, and also general statistics and insights | | [**Incorporating companies' best practices**](https://qodo-merge-docs.qodo.ai/tools/improve/#best-practices) | Use the companies' best practices as reference to increase the effectiveness and the relevance of the code suggestions | | [**Interactive triggering**](https://qodo-merge-docs.qodo.ai/tools/analyze/#example-usage) | Interactively apply different tools via the `analyze` command | -| [**SOC2 compliance check**](https://qodo-merge-docs.qodo.ai/tools/review/#configuration-options) | Ensures the PR contains a ticket to a project management system (e.g., Jira, Asana, Trello, etc.) 
| [**Custom labels**](https://qodo-merge-docs.qodo.ai/tools/describe/#handle-custom-labels-from-the-repos-labels-page) | Define custom labels for Qodo Merge to assign to the PR | ### Additional tools diff --git a/docs/docs/tools/review.md b/docs/docs/tools/review.md index 16765629b..2fd774f6e 100644 --- a/docs/docs/tools/review.md +++ b/docs/docs/tools/review.md @@ -138,20 +138,9 @@ num_code_suggestions = ... require_security_review If set to true, the tool will add a section that checks if the PR contains a possible security or vulnerability issue. Default is true. - - -!!! example "SOC2 ticket compliance 💎" - -This sub-tool checks if the PR description properly contains a ticket to a project management system (e.g., Jira, Asana, Trello, etc.), as required by SOC2 compliance. If not, it will add a label to the PR: "Missing SOC2 ticket". - - - - - - - - + +
-    <td><b>require_soc2_ticket</b></td>
-    <td>If set to true, the SOC2 ticket checker sub-tool will be enabled. Default is false.</td>
-    <td><b>soc2_ticket_prompt</b></td>
-    <td>The prompt for the SOC2 ticket review. Default is: `Does the PR description include a link to ticket in a project management system (e.g., Jira, Asana, Trello, etc.) ?`. Edit this field if your compliance requirements are different.</td>
+    <td><b>require_ticket_analysis_review</b></td>
+    <td>If set to true, and the PR contains a GitHub ticket number, the tool will add a section that checks if the PR in fact fulfilled the ticket requirements. Default is true.</td>
@@ -193,7 +182,7 @@ If enabled, the `review` tool can approve a PR when a specific comment, `/review It is recommended to review the [Configuration options](#configuration-options) section, and choose the relevant options for your use case. Some of the features that are disabled by default are quite useful, and should be considered for enabling. For example: - `require_score_review`, `require_soc2_ticket`, and more. + `require_score_review`, and more. On the other hand, if you find one of the enabled features to be irrelevant for your use case, disable it. No default configuration can fit all use cases. diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py index bd0781ae7..98c69d007 100644 --- a/pr_agent/algo/utils.py +++ b/pr_agent/algo/utils.py @@ -1,18 +1,22 @@ from __future__ import annotations -import html2text -import html import copy import difflib +import hashlib +import html import json import os import re import textwrap import time +import traceback from datetime import datetime from enum import Enum from typing import Any, List, Tuple + +import html2text +import requests import yaml from pydantic import BaseModel from starlette_context import context @@ -110,6 +114,7 @@ def convert_to_markdown_v2(output_data: dict, "Insights from user's answers": "📝", "Code feedback": "🤖", "Estimated effort to review [1-5]": "⏱️", + "Ticket compliance check": "🎫", } markdown_text = "" if not incremental_review: @@ -165,6 +170,8 @@ def convert_to_markdown_v2(output_data: dict, markdown_text += f'### {emoji} No relevant tests\n\n' else: markdown_text += f"### PR contains tests\n\n" + elif 'ticket compliance check' in key_nice.lower(): + markdown_text = ticket_markdown_logic(emoji, markdown_text, value, gfm_supported) elif 'security concerns' in key_nice.lower(): if gfm_supported: markdown_text += f"" @@ -254,6 +261,52 @@ def convert_to_markdown_v2(output_data: dict, return markdown_text +def ticket_markdown_logic(emoji, markdown_text, value, gfm_supported) -> str: + ticket_compliance_str = "" + final_compliance_level = -1 + if isinstance(value, list): + for v in value: + ticket_url = v.get('ticket_url', '').strip() + compliance_level = v.get('overall_compliance_level', '').strip() + # add emojis, if 'Fully compliant' ✅, 'Partially compliant' 🔶, or 'Not compliant' ❌ + if compliance_level.lower() == 'fully compliant': + # compliance_level = '✅ Fully compliant' + final_compliance_level = 2 if final_compliance_level == -1 else 1 + elif compliance_level.lower() == 'partially compliant': + # compliance_level = '🔶 Partially compliant' + final_compliance_level = 1 + elif compliance_level.lower() == 'not compliant': + # compliance_level = '❌ Not compliant' + final_compliance_level = 0 if final_compliance_level < 1 else 1 + + # explanation = v.get('compliance_analysis', '').strip() + explanation = '' + fully_compliant_str = v.get('fully_compliant_requirements', '').strip() + not_compliant_str = v.get('not_compliant_requirements', '').strip() + if fully_compliant_str: + explanation += f"Fully compliant requirements:\n{fully_compliant_str}\n\n" + if not_compliant_str: + explanation += f"Not compliant requirements:\n{not_compliant_str}\n\n" + + ticket_compliance_str += f"\n\n**[{ticket_url.split('/')[-1]}]({ticket_url}) - {compliance_level}**\n\n{explanation}\n\n" + if final_compliance_level == 2: + compliance_level = '✅' + elif final_compliance_level == 1: + compliance_level = '🔶' + else: + compliance_level = '❌' + + if gfm_supported: + markdown_text += f"\n\n" + markdown_text += f"**{emoji} Ticket 
compliance analysis {compliance_level}**\n\n" + markdown_text += ticket_compliance_str + markdown_text += f"\n" + else: + markdown_text += f"### {emoji} Ticket compliance analysis {compliance_level}\n\n" + markdown_text += ticket_compliance_str+"\n\n" + return markdown_text + + def process_can_be_split(emoji, value): try: # key_nice = "Can this PR be split?" @@ -554,7 +607,8 @@ def load_yaml(response_text: str, keys_fix_yaml: List[str] = [], first_key="", l get_logger().warning(f"Initial failure to parse AI prediction: {e}") data = try_fix_yaml(response_text, keys_fix_yaml=keys_fix_yaml, first_key=first_key, last_key=last_key) if not data: - get_logger().error(f"Failed to parse AI prediction after fallbacks", artifact={'response_text': response_text}) + get_logger().error(f"Failed to parse AI prediction after fallbacks", + artifact={'response_text': response_text}) else: get_logger().info(f"Successfully parsed AI prediction after fallbacks", artifact={'response_text': response_text}) @@ -841,56 +895,64 @@ def find_line_number_of_relevant_line_in_file(diff_files: List[FilePatchInfo], break return position, absolute_position -def validate_and_await_rate_limit(rate_limit_status=None, git_provider=None, get_rate_limit_status_func=None): - if git_provider and not rate_limit_status: - rate_limit_status = {'resources': git_provider.github_client.get_rate_limit().raw_data} +def get_rate_limit_status(github_token) -> dict: + GITHUB_API_URL = get_settings(use_context=False).get("GITHUB.BASE_URL", "https://api.github.com").rstrip("/") # "https://api.github.com" + # GITHUB_API_URL = "https://api.github.com" + RATE_LIMIT_URL = f"{GITHUB_API_URL}/rate_limit" + HEADERS = { + "Accept": "application/vnd.github.v3+json", + "Authorization": f"token {github_token}" + } - if not rate_limit_status: - rate_limit_status = get_rate_limit_status_func() + response = requests.get(RATE_LIMIT_URL, headers=HEADERS) + try: + rate_limit_info = response.json() + if rate_limit_info.get('message') == 'Rate limiting is not enabled.': # for github enterprise + return {'resources': {}} + response.raise_for_status() # Check for HTTP errors + except: # retry + time.sleep(0.1) + response = requests.get(RATE_LIMIT_URL, headers=HEADERS) + return response.json() + return rate_limit_info + + +def validate_rate_limit_github(github_token, installation_id=None, threshold=0.1) -> bool: + try: + rate_limit_status = get_rate_limit_status(github_token) + if installation_id: + get_logger().debug(f"installation_id: {installation_id}, Rate limit status: {rate_limit_status['rate']}") # validate that the rate limit is not exceeded - is_rate_limit = False - for key, value in rate_limit_status['resources'].items(): - if value['remaining'] == 0: - print(f"key: {key}, value: {value}") - is_rate_limit = True - sleep_time_sec = value['reset'] - datetime.now().timestamp() - sleep_time_hour = sleep_time_sec / 3600.0 - print(f"Rate limit exceeded. 
Sleeping for {sleep_time_hour} hours") - if sleep_time_sec > 0: - time.sleep(sleep_time_sec+1) - - if git_provider: - rate_limit_status = {'resources': git_provider.github_client.get_rate_limit().raw_data} - else: - rate_limit_status = get_rate_limit_status_func() - - return is_rate_limit - - -def get_largest_component(pr_url): - from pr_agent.tools.pr_analyzer import PRAnalyzer - publish_output = get_settings().config.publish_output - get_settings().config.publish_output = False # disable publish output - analyzer = PRAnalyzer(pr_url) - methods_dict_files = analyzer.run_sync() - get_settings().config.publish_output = publish_output - max_lines_changed = 0 - file_b = "" - component_name_b = "" - for file in methods_dict_files: - for method in methods_dict_files[file]: - try: - if methods_dict_files[file][method]['num_plus_lines'] > max_lines_changed: - max_lines_changed = methods_dict_files[file][method]['num_plus_lines'] - file_b = file - component_name_b = method - except: - pass - if component_name_b: - get_logger().info(f"Using the largest changed component: '{component_name_b}'") - return component_name_b, file_b - else: - return None, None + # validate that the rate limit is not exceeded + for key, value in rate_limit_status['resources'].items(): + if value['remaining'] < value['limit'] * threshold: + get_logger().error(f"key: {key}, value: {value}") + return False + return True + except Exception as e: + get_logger().error(f"Error in rate limit {e}", + artifact={"traceback": traceback.format_exc()}) + return True + + +def validate_and_await_rate_limit(github_token): + try: + rate_limit_status = get_rate_limit_status(github_token) + # validate that the rate limit is not exceeded + for key, value in rate_limit_status['resources'].items(): + if value['remaining'] < value['limit'] // 80: + get_logger().error(f"key: {key}, value: {value}") + sleep_time_sec = value['reset'] - datetime.now().timestamp() + sleep_time_hour = sleep_time_sec / 3600.0 + get_logger().error(f"Rate limit exceeded. Sleeping for {sleep_time_hour} hours") + if sleep_time_sec > 0: + time.sleep(sleep_time_sec + 1) + rate_limit_status = get_rate_limit_status(github_token) + return rate_limit_status + except: + get_logger().error("Error in rate limit") + return None + def github_action_output(output_data: dict, key_name: str): try: @@ -906,7 +968,7 @@ def github_action_output(output_data: dict, key_name: str): def show_relevant_configurations(relevant_section: str) -> str: - skip_keys = ['ai_disclaimer', 'ai_disclaimer_title', 'ANALYTICS_FOLDER', 'secret_provider', "skip_keys", + skip_keys = ['ai_disclaimer', 'ai_disclaimer_title', 'ANALYTICS_FOLDER', 'secret_provider', "skip_keys", "app_id", "redirect", 'trial_prefix_message', 'no_eligible_message', 'identity_provider', 'ALLOWED_REPOS','APP_NAME'] extra_skip_keys = get_settings().config.get('config.skip_keys', []) if extra_skip_keys: @@ -939,6 +1001,25 @@ def is_value_no(value): return False +def set_pr_string(repo_name, pr_number): + return f"{repo_name}#{pr_number}" + + +def string_to_uniform_number(s: str) -> float: + """ + Convert a string to a uniform number in the range [0, 1]. + The uniform distribution is achieved by the nature of the SHA-256 hash function, which produces a uniformly distributed hash value over its output space. 
+ """ + # Generate a hash of the string + hash_object = hashlib.sha256(s.encode()) + # Convert the hash to an integer + hash_int = int(hash_object.hexdigest(), 16) + # Normalize the integer to the range [0, 1] + max_hash_int = 2 ** 256 - 1 + uniform_number = float(hash_int) / max_hash_int + return uniform_number + + def process_description(description_full: str) -> Tuple[str, List]: if not description_full: return "", [] @@ -998,7 +1079,10 @@ def process_description(description_full: str) -> Tuple[str, List]: 'long_summary': long_summary }) else: - get_logger().error(f"Failed to parse description", artifact={'description': file_data}) + if '...' in file_data: + pass # PR with many files. some did not get analyzed + else: + get_logger().error(f"Failed to parse description", artifact={'description': file_data}) except Exception as e: get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data}) diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index add06bb6b..d4ecf3794 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -51,9 +51,7 @@ require_tests_review=true require_estimate_effort_to_review=true require_can_be_split_review=false require_security_review=true -# soc2 -require_soc2_ticket=false -soc2_ticket_prompt="Does the PR description include a link to ticket in a project management system (e.g., Jira, Asana, Trello, etc.) ?" +require_ticket_analysis_review=true # general options num_code_suggestions=0 inline_code_comments = false diff --git a/pr_agent/settings/pr_description_prompts.toml b/pr_agent/settings/pr_description_prompts.toml index de7c3d548..364dd9af7 100644 --- a/pr_agent/settings/pr_description_prompts.toml +++ b/pr_agent/settings/pr_description_prompts.toml @@ -78,9 +78,9 @@ pr_files: ... ... {%- endif %} -description: |- +description: | ... -title: |- +title: | ... {%- if enable_custom_labels %} labels: @@ -94,7 +94,26 @@ labels: Answer should be a valid YAML, and nothing else. Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ -user="""PR Info: +user=""" +{%- if related_tickets %} +Related Ticket Info: +{% for ticket in related_tickets %} +===== +Ticket Title: '{{ ticket.title }}' +{%- if ticket.labels %} +Ticket Labels: {{ ticket.labels }} +{%- endif %} +{%- if ticket.body %} +Ticket Description: +##### +{{ ticket.body }} +##### +{%- endif %} +===== +{% endfor %} +{%- endif %} + +PR Info: Previous title: '{{title}}' diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml index e3b4bfe48..0b61e8ea5 100644 --- a/pr_agent/settings/pr_reviewer_prompts.toml +++ b/pr_agent/settings/pr_reviewer_prompts.toml @@ -85,7 +85,20 @@ class KeyIssuesComponentLink(BaseModel): start_line: int = Field(description="The start line that corresponds to this issue in the relevant file") end_line: int = Field(description="The end line that corresponds to this issue in the relevant file") +{%- if related_tickets %} + +class TicketCompliance(BaseModel): + ticket_url: str = Field(description="Ticket URL or ID") + ticket_requirements: str = Field(description="Repeat, in your own words, all ticket requirements, in bullet points") + fully_compliant_requirements: str = Field(description="A list, in bullet points, of which requirements are met by the PR code. Don't explain how the requirements are met, just list them shortly. 
Can be empty") + not_compliant_requirements: str = Field(description="A list, in bullet points, of which requirements are not met by the PR code. Don't explain how the requirements are not met, just list them shortly. Can be empty") + overall_compliance_level: str = Field(description="Overall give this PR one of these three values in relation to the ticket: 'Fully compliant', 'Partially compliant', or 'Not compliant'") +{%- endif %} + class Review(BaseModel): +{%- if related_tickets %} + ticket_compliance_check: List[TicketCompliance] = Field(description="A list of compliance checks for the related tickets") +{%- endif %} {%- if require_estimate_effort_to_review %} estimated_effort_to_review_[1-5]: int = Field(description="Estimate, on a scale of 1-5 (inclusive), the time and effort required to review this PR by an experienced and knowledgeable developer. 1 means short and easy review , 5 means long and hard review. Take into account the size, complexity, quality, and the needed changes of the PR code diff.") {%- endif %} @@ -130,6 +143,19 @@ class PRReview(BaseModel): Example output: ```yaml review: +{%- if related_tickets %} + ticket_compliance_check: + - ticket_url: | + ... + ticket_requirements: | + ... + fully_compliant_requirements: | + ... + not_compliant_requirements: | + ... + overall_compliance_level: | + ... +{%- endif %} {%- if require_estimate_effort_to_review %} estimated_effort_to_review_[1-5]: | 3 @@ -176,7 +202,33 @@ code_feedback: Answer should be a valid YAML, and nothing else. Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ -user="""--PR Info-- +user=""" +{%- if related_tickets %} +--PR Ticket Info-- +{%- for ticket in related_tickets %} +===== +Ticket URL: '{{ ticket.ticket_url }}' + +Ticket Title: '{{ ticket.title }}' + +{%- if ticket.labels %} + +Ticket Labels: {{ ticket.labels }} + +{%- endif %} +{%- if ticket.body %} + +Ticket Description: +##### +{{ ticket.body }} +##### +{%- endif %} +===== +{% endfor %} +{%- endif %} + + +--PR Info-- Title: '{{title}}' diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py index 9f7d79d31..c965d84e7 100644 --- a/pr_agent/tools/pr_description.py +++ b/pr_agent/tools/pr_description.py @@ -20,6 +20,8 @@ from pr_agent.git_providers.git_provider import get_main_pr_language from pr_agent.log import get_logger from pr_agent.servers.help import HelpMessage +from pr_agent.tools.ticket_pr_compliance_check import extract_ticket_links_from_pr_description, extract_tickets, \ + extract_and_cache_pr_tickets class PRDescription: @@ -38,6 +40,7 @@ def __init__(self, pr_url: str, args: list = None, self.git_provider.get_languages(), self.git_provider.get_files() ) self.pr_id = self.git_provider.get_pr_id() + self.keys_fix = ["filename:", "language:", "changes_summary:", "changes_title:", "description:", "title:"] if get_settings().pr_description.enable_semantic_files_types and not self.git_provider.is_supported( "gfm_markdown"): @@ -60,6 +63,7 @@ def __init__(self, pr_url: str, args: list = None, "enable_custom_labels": get_settings().config.enable_custom_labels, "custom_labels_class": "", # will be filled if necessary in 'set_custom_labels' function "enable_semantic_files_types": get_settings().pr_description.enable_semantic_files_types, + "related_tickets": "", } self.user_description = self.git_provider.get_user_description() @@ -87,6 +91,9 @@ async def run(self): if get_settings().config.publish_output and not get_settings().config.get('is_auto_command', False): 
self.git_provider.publish_comment("Preparing PR description...", is_temporary=True) + # ticket extraction if exists + await extract_and_cache_pr_tickets(self.git_provider, self.vars) + await retry_with_fallback_models(self._prepare_prediction, ModelType.TURBO) if self.prediction: @@ -226,7 +233,7 @@ async def _prepare_prediction(self, model: str) -> None: file_description_str_list = [] for i, result in enumerate(results): prediction_files = result.strip().removeprefix('```yaml').strip('`').strip() - if load_yaml(prediction_files) and prediction_files.startswith('pr_files'): + if load_yaml(prediction_files, keys_fix_yaml=self.keys_fix) and prediction_files.startswith('pr_files'): prediction_files = prediction_files.removeprefix('pr_files:').strip() file_description_str_list.append(prediction_files) else: @@ -304,16 +311,16 @@ async def _prepare_prediction(self, model: str) -> None: # final processing self.prediction = prediction_headers + "\n" + "pr_files:\n" + files_walkthrough - if not load_yaml(self.prediction): + if not load_yaml(self.prediction, keys_fix_yaml=self.keys_fix): get_logger().error(f"Error getting valid YAML in large PR handling for describe {self.pr_id}") - if load_yaml(prediction_headers): + if load_yaml(prediction_headers, keys_fix_yaml=self.keys_fix): get_logger().debug(f"Using only headers for describe {self.pr_id}") self.prediction = prediction_headers async def extend_additional_files(self, remaining_files_list) -> str: prediction = self.prediction try: - original_prediction_dict = load_yaml(self.prediction) + original_prediction_dict = load_yaml(self.prediction, keys_fix_yaml=self.keys_fix) prediction_extra = "pr_files:" for file in remaining_files_list: extra_file_yaml = f"""\ @@ -327,12 +334,12 @@ async def extend_additional_files(self, remaining_files_list) -> str: additional files (token-limit) """ prediction_extra = prediction_extra + "\n" + extra_file_yaml.strip() - prediction_extra_dict = load_yaml(prediction_extra) + prediction_extra_dict = load_yaml(prediction_extra, keys_fix_yaml=self.keys_fix) # merge the two dictionaries if isinstance(original_prediction_dict, dict) and isinstance(prediction_extra_dict, dict): original_prediction_dict["pr_files"].extend(prediction_extra_dict["pr_files"]) new_yaml = yaml.dump(original_prediction_dict) - if load_yaml(new_yaml): + if load_yaml(new_yaml, keys_fix_yaml=self.keys_fix): prediction = new_yaml return prediction except Exception as e: @@ -361,7 +368,7 @@ async def _get_prediction(self, model: str, patches_diff: str, prompt="pr_descri def _prepare_data(self): # Load the AI prediction data into a dictionary - self.data = load_yaml(self.prediction.strip()) + self.data = load_yaml(self.prediction.strip(), keys_fix_yaml=self.keys_fix) if get_settings().pr_description.add_original_user_description and self.user_description: self.data["User Description"] = self.user_description diff --git a/pr_agent/tools/pr_reviewer.py b/pr_agent/tools/pr_reviewer.py index 88799d987..f5f82e060 100644 --- a/pr_agent/tools/pr_reviewer.py +++ b/pr_agent/tools/pr_reviewer.py @@ -1,5 +1,6 @@ import copy import datetime +import traceback from collections import OrderedDict from functools import partial from typing import List, Tuple @@ -15,6 +16,7 @@ from pr_agent.git_providers.git_provider import IncrementalPR, get_main_pr_language from pr_agent.log import get_logger from pr_agent.servers.help import HelpMessage +from pr_agent.tools.ticket_pr_compliance_check import extract_tickets, extract_and_cache_pr_tickets class PRReviewer: @@ -84,6 
+86,7 @@ def __init__(self, pr_url: str, is_answer: bool = False, is_auto: bool = False, "custom_labels": "", "enable_custom_labels": get_settings().config.enable_custom_labels, "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False), + "related_tickets": get_settings().get('related_tickets', []), } self.token_handler = TokenHandler( @@ -121,6 +124,9 @@ async def run(self) -> None: 'config': dict(get_settings().config)} get_logger().debug("Relevant configs", artifacts=relevant_configs) + # ticket extraction if exists + await extract_and_cache_pr_tickets(self.git_provider, self.vars) + if self.incremental.is_incremental and hasattr(self.git_provider, "unreviewed_files_set") and not self.git_provider.unreviewed_files_set: get_logger().info(f"Incremental review is enabled for {self.pr_url} but there are no new files") previous_review_url = "" @@ -207,7 +213,7 @@ def _prepare_pr_review(self) -> str: first_key = 'review' last_key = 'security_concerns' data = load_yaml(self.prediction.strip(), - keys_fix_yaml=["estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", + keys_fix_yaml=["ticket_compliance_check", "estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", "relevant_file:", "relevant_line:", "suggestion:"], first_key=first_key, last_key=last_key) github_action_output(data, 'review') @@ -282,7 +288,7 @@ def _publish_inline_code_comments(self) -> None: first_key = 'review' last_key = 'security_concerns' data = load_yaml(self.prediction.strip(), - keys_fix_yaml=["estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", + keys_fix_yaml=["ticket_compliance_check", "estimated_effort_to_review_[1-5]:", "security_concerns:", "key_issues_to_review:", "relevant_file:", "relevant_line:", "suggestion:"], first_key=first_key, last_key=last_key) comments: List[str] = [] @@ -401,7 +407,16 @@ def set_review_labels(self, data): review_labels = [] if get_settings().pr_reviewer.enable_review_labels_effort: estimated_effort = data['review']['estimated_effort_to_review_[1-5]'] - estimated_effort_number = int(estimated_effort.split(',')[0]) + estimated_effort_number = 0 + if isinstance(estimated_effort, str): + try: + estimated_effort_number = int(estimated_effort.split(',')[0]) + except ValueError: + get_logger().warning(f"Invalid estimated_effort value: {estimated_effort}") + elif isinstance(estimated_effort, int): + estimated_effort_number = estimated_effort + else: + get_logger().warning(f"Unexpected type for estimated_effort: {type(estimated_effort)}") if 1 <= estimated_effort_number <= 5: # 1, because ... 
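+                # only a successfully parsed effort in the 1-5 range produces a label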
review_labels.append(f'Review effort [1-5]: {estimated_effort_number}') if get_settings().pr_reviewer.enable_review_labels_security and get_settings().pr_reviewer.require_security_review: diff --git a/pr_agent/tools/ticket_pr_compliance_check.py b/pr_agent/tools/ticket_pr_compliance_check.py new file mode 100644 index 000000000..03fdc88b5 --- /dev/null +++ b/pr_agent/tools/ticket_pr_compliance_check.py @@ -0,0 +1,113 @@ +import re +import traceback + +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import GithubProvider +from pr_agent.log import get_logger + + +def find_jira_tickets(text): + # Regular expression patterns for JIRA tickets + patterns = [ + r'\b[A-Z]{2,10}-\d{1,7}\b', # Standard JIRA ticket format (e.g., PROJ-123) + r'(?:https?://[^\s/]+/browse/)?([A-Z]{2,10}-\d{1,7})\b' # JIRA URL or just the ticket + ] + + tickets = set() + for pattern in patterns: + matches = re.findall(pattern, text) + for match in matches: + if isinstance(match, tuple): + # If it's a tuple (from the URL pattern), take the last non-empty group + ticket = next((m for m in reversed(match) if m), None) + else: + ticket = match + if ticket: + tickets.add(ticket) + + return list(tickets) + + +def extract_ticket_links_from_pr_description(pr_description, repo_path): + """ + Extract all ticket links from PR description + """ + + # example link to search for: https://github.com/Codium-ai/pr-agent-pro/issues/525 + pattern = r'https://github[^/]+/[^/]+/[^/]+/issues/\d+' # should support also github server (for example 'https://github.company.ai/Codium-ai/pr-agent-pro/issues/525') + + # Find all matches in the text + github_tickets = re.findall(pattern, pr_description) + + # Find all issues referenced like #123 and add them as https://github.com/{repo_path}/issues/{issue_number} + # (unneeded, since when you pull the actual comment, it appears as a full link) + # issue_number_pattern = r'#\d+' + # issue_numbers = re.findall(issue_number_pattern, pr_description) + # for issue_number in issue_numbers: + # issue_number = issue_number[1:] # remove # + # # check if issue_number is a valid number and len(issue_number) < 5 + # if issue_number.isdigit() and len(issue_number) < 5: + # github_tickets.append(f'https://github.com/{repo_path}/issues/{issue_number}') + + return github_tickets + + +async def extract_tickets(git_provider): + MAX_TICKET_CHARACTERS = 10000 + try: + if isinstance(git_provider, GithubProvider): + user_description = git_provider.get_user_description() + tickets = extract_ticket_links_from_pr_description(user_description, git_provider.repo) + tickets_content = [] + if tickets: + for ticket in tickets: + # extract ticket number and repo name + repo_name, original_issue_number = git_provider._parse_issue_url(ticket) + + # get the ticket object + issue_main = git_provider.repo_obj.get_issue(original_issue_number) + + # clip issue_main.body max length + issue_body = issue_main.body + if len(issue_main.body) > MAX_TICKET_CHARACTERS: + issue_body = issue_main.body[:MAX_TICKET_CHARACTERS] + "..." 
+
+                    # extract labels
+                    labels = []
+                    try:
+                        for label in issue_main.labels:
+                            if isinstance(label, str):
+                                labels.append(label)
+                            else:
+                                labels.append(label.name)
+                    except Exception as e:
+                        get_logger().error(f"Error extracting labels: {e}",
+                                           artifact={"traceback": traceback.format_exc()})
+                    tickets_content.append(
+                        {'ticket_id': issue_main.number,
+                         'ticket_url': ticket, 'title': issue_main.title, 'body': issue_body,
+                         'labels': ", ".join(labels)})
+            return tickets_content
+
+    except Exception as e:
+        get_logger().error(f"Error extracting tickets: {e}",
+                           artifact={"traceback": traceback.format_exc()})
+
+
+async def extract_and_cache_pr_tickets(git_provider, vars):
+    # skip ticket extraction when ticket analysis review is disabled
+    # (the flag lives in the [pr_reviewer] section of configuration.toml)
+    if not get_settings().get('pr_reviewer.require_ticket_analysis_review', False):
+        return
+    related_tickets = get_settings().get('related_tickets', [])
+    if not related_tickets:
+        tickets_content = await extract_tickets(git_provider)
+        if tickets_content:
+            get_logger().info("Extracted tickets from PR description", artifact={"tickets": tickets_content})
+            vars['related_tickets'] = tickets_content
+            get_settings().set('related_tickets', tickets_content)
+    else:  # tickets were already extracted and cached earlier in this run
+        get_logger().info("Using cached tickets", artifact={"tickets": related_tickets})
+        vars['related_tickets'] = related_tickets
+
+
+def check_tickets_relevancy():
+    # placeholder: ticket relevancy filtering is not implemented yet
+    return True
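
For reference, a standalone sketch of the ticket-link extraction behavior added above. It reuses the exact pattern from `extract_ticket_links_from_pr_description`; the sample description, hosts, and issue numbers are invented for illustration:

```python
import re

# Same pattern as in extract_ticket_links_from_pr_description: matches issue
# links on github.com and on GitHub Enterprise hosts (e.g. github.company.ai).
PATTERN = r'https://github[^/]+/[^/]+/[^/]+/issues/\d+'

sample_description = (  # invented sample PR description
    "Implements https://github.com/Codium-ai/pr-agent/issues/1262 and also "
    "touches https://github.example.ai/acme/backend/issues/77."
)

print(re.findall(PATTERN, sample_description))
# ['https://github.com/Codium-ai/pr-agent/issues/1262',
#  'https://github.example.ai/acme/backend/issues/77']
```

Bare `#123` references are deliberately not resolved here; as the commented-out block in the new module notes, GitHub already expands them to full links when the description is fetched.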