diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..ddc8296
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,8 @@
+[flake8]
+max-complexity = 15
+
+ignore =
+    # Line break after binary operator
+    W503, W504
+    # Line too long
+    E501,
diff --git a/.github/workflows/check_n_push_image.yml b/.github/workflows/check_n_push_image.yml
index a428bce..1a6adf7 100644
--- a/.github/workflows/check_n_push_image.yml
+++ b/.github/workflows/check_n_push_image.yml
@@ -2,6 +2,8 @@ name: Check source code and push created image based on sources
 
 on:
   push:
+    branches:
+      - main
     paths-ignore:
       - 'docs/**'
       - '**.md'
@@ -24,12 +26,10 @@ jobs:
       with:
         python-version: 3.8
 
-    - name: Lint with flake8
+    - name: Lint with flake8 and isort
       run: |
-        pip install flake8 flake8-bugbear flake8-comprehensions mccabe
-        make substitute-sources
-        flake8 src/ --statistic --max-line-length=80 --max-complexity 15 -qq
-        flake8 test/ --statistic --max-line-length=120 --max-complexity 15 -qq
+        pip install pre-commit==2.20.0
+        make substitute-sources pre-commit
 
   docker-build-test-autotest:
     runs-on: ubuntu-20.04
@@ -65,7 +65,7 @@ jobs:
       with:
         if-no-files-found: error
 
     - name: Upload created image
-      if: ${{ github.ref == 'refs/heads/main' }}
+      if: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
       uses: actions/upload-artifact@v3
       with:
         name: codeplag-ubuntu20.04
@@ -76,7 +76,7 @@ jobs:
   push-image:
     runs-on: ubuntu-20.04
     needs: [check-code, docker-build-test-autotest]
-    if: github.ref == 'refs/heads/main'
+    if: startsWith(github.event.ref, 'refs/tags/v')
 
     steps:
     - name: Checkout
diff --git a/.gitignore b/.gitignore
index d5268d5..6586f47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,7 +30,6 @@ debian/copyright
 
 # Substituting
 src/codeplag/consts.py
-src/webparsers/consts.py
 docker/base_ubuntu2004.dockerfile
 docker/test_ubuntu2004.dockerfile
 docker/ubuntu2004.dockerfile
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..7e89eba
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,16 @@
+default_language_version:
+  python: python3.8
+repos:
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.10.1
+    hooks:
+      - id: isort
+  - repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        additional_dependencies:
+          - flake8-bugbear==22.8.23
+          - flake8-comprehensions==3.10.0
+          - flake8-simplify==0.19.3
+          - mccabe==0.7.0
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 3647fdf..84d66e5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-UTIL_VERSION := 0.2.2
+UTIL_VERSION := 0.2.3
 UTIL_NAME := codeplag
 
 BASE_DOCKER_TAG := $(shell echo $(UTIL_NAME)-base-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
@@ -6,12 +6,10 @@ TEST_DOCKER_TAG := $(shell echo $(UTIL_NAME)-test-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
 DOCKER_TAG ?= $(shell echo $(UTIL_NAME)-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
 
 PWD := $(shell pwd)
-PYTHONPATH = $(PWD)/src/
+PYTHONPATH := $(PWD)/src/
 
 LOGS_PATH := /var/log/codeplag
 CODEPLAG_LOG_PATH := $(LOGS_PATH)/$(UTIL_NAME).log
-WEBPARSERS_LOG_PATH := $(LOGS_PATH)/webparsers.log
-SOURCE_SUB_FILES := src/codeplag/consts.py \
-                    src/webparsers/consts.py
+SOURCE_SUB_FILES := src/codeplag/consts.py
 DEBIAN_SUB_FILES := debian/changelog \
                     debian/control \
                     debian/preinst \
@@ -30,7 +28,6 @@ all: substitute-sources man install
 	sed \
 		-e "s|@UTIL_NAME@|${UTIL_NAME}|g" \
 		-e "s|@UTIL_VERSION@|${UTIL_VERSION}|g" \
-		-e "s|@WEBPARSERS_LOG_PATH@|${WEBPARSERS_LOG_PATH}|g" \
 		-e "s|@CODEPLAG_LOG_PATH@|${CODEPLAG_LOG_PATH}|g" \
 		-e "s|@PYTHON_REQUIRED_LIBS@|${PYTHON_REQUIRED_LIBS}|g" \
 		-e "s|@LOGS_PATH@|${LOGS_PATH}|g" \
@@ -78,6 +75,9 @@ autotest:
 	pytest test/auto -q
 	make clean-cache
 
+pre-commit:
+	pre-commit run --all-files
+
 clean-cache:
 	find . -maxdepth 1 -type d | grep -E "pytest_cache" | (xargs rm -r 2> /dev/null || exit 0)
 	find . -type d | grep -E "__pycache__" | (xargs rm -r 2> /dev/null || exit 0)
@@ -97,7 +97,6 @@ clean: clean-cache
 
 clean-all: clean
 	rm --force src/codeplag/consts.py
-	rm --force src/webparsers/consts.py
 	rm --force docker/base_ubuntu2004.dockerfile
 	rm --force docker/test_ubuntu2004.dockerfile
"s|@LOGS_PATH@|${LOGS_PATH}|g" \ @@ -78,6 +75,9 @@ autotest: pytest test/auto -q make clean-cache +pre-commit: + pre-commit run --all-files + clean-cache: find . -maxdepth 1 -type d | grep -E "pytest_cache" | (xargs rm -r 2> /dev/null || exit 0) find . -type d | grep -E "__pycache__" | (xargs rm -r 2> /dev/null || exit 0) @@ -97,7 +97,6 @@ clean: clean-cache clean-all: clean rm --force src/codeplag/consts.py - rm --force src/webparsers/consts.py rm --force docker/base_ubuntu2004.dockerfile rm --force docker/test_ubuntu2004.dockerfile diff --git a/docker/test_ubuntu2004.dockerfile.in b/docker/test_ubuntu2004.dockerfile.in index 75cc2b9..0b3cbcf 100644 --- a/docker/test_ubuntu2004.dockerfile.in +++ b/docker/test_ubuntu2004.dockerfile.in @@ -3,7 +3,7 @@ ENV DEBIAN_FRONTEND=noninteractive ADD debian/ /usr/src/@UTIL_NAME@/debian RUN apt-get install -y debhelper -RUN pip3 install argparse-manpage==3 pytest==7.1.2 +RUN pip3 install argparse-manpage==3 pytest==7.1.2 pytest-mock==3.8.2 RUN mkdir -p @LOGS_PATH@ CMD make test diff --git a/docs/notebooks/utils.py b/docs/notebooks/utils.py index b59bb36..0726ee0 100644 --- a/docs/notebooks/utils.py +++ b/docs/notebooks/utils.py @@ -6,12 +6,13 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from decouple import Config, RepositoryEnv +from scipy.optimize import curve_fit + from codeplag.algorithms.featurebased import counter_metric, struct_compare from codeplag.algorithms.stringbased import gst from codeplag.algorithms.tokenbased import value_jakkar_coef from codeplag.pyplag.utils import get_ast_from_content, get_features_from_ast -from decouple import Config, RepositoryEnv -from scipy.optimize import curve_fit from webparsers.github_parser import GitHubParser @@ -74,15 +75,15 @@ def get_time_to_meta(df, iterations=10): to_meta_time = [] for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows(): print(index, " " * 20, end='\r') - for i in range(iterations): + for _ in range(iterations): tree = get_ast_from_content(content[0], content[1]) try: start = perf_counter() - features1 = get_features_from_ast(tree) + get_features_from_ast(tree) end = perf_counter() - start to_meta_time.append(end) count_lines.append(content[2]) - except: + except Exception: break output = pd.DataFrame( @@ -115,19 +116,19 @@ def plot_and_save_result(df, xlabel, ylabel, title, what, if trend == 'linear': z = np.polyfit(unique_count_lines, mean_times, 1) p = np.poly1d(z) - plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Линейный тренд.') + plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.') elif trend == 'n^2': popt_cons, _ = curve_fit(square_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100])) p = np.poly1d(popt_cons) - plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Квадратичный тренд.') + plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Квадратичный тренд.') elif trend == 'n^3': popt_cons, _ = curve_fit(cube_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100])) p = np.poly1d(popt_cons) - plt.plot(unique_count_lines, p(unique_count_lines),"r--", label='Кубический тренд.') + plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Кубический тренд.') elif trend == 'n^4': popt_cons, _ = curve_fit(quart_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 
diff --git a/src/codeplag/algorithms/featurebased.py b/src/codeplag/algorithms/featurebased.py
index ae471b1..6800ca3 100644
--- a/src/codeplag/algorithms/featurebased.py
+++ b/src/codeplag/algorithms/featurebased.py
@@ -95,12 +95,10 @@ def get_children_indexes(tree: List[Tuple[int, int]],
     if count_of_nodes != 0:
         current_level = tree[0][0]
-        current_index = 0
-        for node in tree:
+        for current_index, node in enumerate(tree):
             if current_level == node[0]:
                 indexes.append(current_index)
                 count_of_children += 1
-            current_index += 1
 
     return indexes, count_of_children
diff --git a/src/codeplag/algorithms/stringbased.py b/src/codeplag/algorithms/stringbased.py
index a73b250..44f78f7 100644
--- a/src/codeplag/algorithms/stringbased.py
+++ b/src/codeplag/algorithms/stringbased.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import numpy as np
 
 
@@ -20,7 +22,7 @@ def m(symbol1, symbol2):
         '''
         return 0 if symbol1 == symbol2 else 1
 
-    def calculate_distance_matrix(self):
+    def calculate_distance_matrix(self) -> np.int64:
         '''
             The function calculates the Levenshtein matrix and
             sets in the distance atribute minimal count of operations
@@ -55,7 +57,8 @@ def get_similarity_value(self):
 
         return 1.0 - self.distance / max(self.s1_length, self.s2_length)
 
 
-def is_marked_match(marked_string_list, begin, length):
+def is_marked_match(marked_string_list: List[int],
+                    begin: int, length: int) -> bool:
     """The function returns true if the match consists in
        the marked list, else false.
 
@@ -64,11 +67,12 @@ def is_marked_match(marked_string_list, begin, length):
     @length - length of match
     """
-    if begin in marked_string_list or \
-       (begin + length - 1) in marked_string_list:
-        return True
-    else:
-        return False
+    condition = (
+        begin in marked_string_list or
+        (begin + length - 1) in marked_string_list
+    )
+
+    return condition
 
 
 def gst(sequence1, sequence2, min_match_len=6):
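A small contract sketch for the rewritten `is_marked_match` — not part of the patch, and it assumes `src/` is on `PYTHONPATH` (the Makefile sets that variable):

```python
from codeplag.algorithms.stringbased import is_marked_match

# A match covering indexes [begin, begin + length - 1] counts as "marked"
# when either of its endpoint indexes is already in the marked list.
assert is_marked_match([3, 4, 5], 5, 2) is True   # begin (5) is marked
assert is_marked_match([3, 4, 5], 1, 3) is True   # end (1 + 3 - 1 = 3) is marked
assert is_marked_match([3, 4, 5], 6, 2) is False  # neither endpoint is marked
```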
diff --git a/src/codeplag/algorithms/tokenbased.py b/src/codeplag/algorithms/tokenbased.py
index dc88a88..6ce8e71 100644
--- a/src/codeplag/algorithms/tokenbased.py
+++ b/src/codeplag/algorithms/tokenbased.py
@@ -8,38 +8,46 @@
 
 import math
+from typing import List, Sequence, Set, Tuple, Union
 
 
-def generate_ngrams(tokens, n=3, hashit=False, unique=False):
+def generate_ngrams(tokens: Sequence[int],
+                    n: int = 3,
+                    hashit: bool = False,
+                    unique: bool = False) -> Union[Set[int],
+                                                   List[int],
+                                                   Set[Tuple[int, ...]],
+                                                   List[Tuple[int, ...]]]:
     """The function returns a list or set of N-grams or list or set of hashes
        of ngrams and may use to generate shingles.
 
     @param tokens - list of tokens
-    @param n - count of elements in sequences
+    @param n - count of elements in ngrams
     @param hashit - If is True, then the function returns a list or set
      of hashes of N-grams
     @param unique - If is True, then the function returns a set of N-grams
      or hashes of N-grams
     """
+    count_tokens = len(tokens)
     if hashit:
         if unique:
             return {
                 hash(tuple(tokens[i:i + n]))
-                for i in range(len(tokens) - n + 1)
+                for i in range(count_tokens - n + 1)
             }
         return [
             hash(tuple(tokens[i:i + n]))
-            for i in range(len(tokens) - n + 1)
+            for i in range(count_tokens - n + 1)
         ]
 
     if unique:
-        return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}
+        return {tuple(tokens[i:i + n]) for i in range(count_tokens - n + 1)}
 
-    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
+    return [tuple(tokens[i:i + n]) for i in range(count_tokens - n + 1)]
 
 
-def get_imprints_from_hashes(hashes):
+def get_imprints_from_hashes(hashes: Sequence[int]) -> List[int]:
     """The function return imprints of the given hashes
 
     @param hashes - list of hashes
@@ -54,14 +62,20 @@
     return [hashes[index] for index in range(0, count_hashes, k)]
 
 
-def value_jakkar_coef(tokens_first, tokens_second, ngrams_length=3):
+def value_jakkar_coef(tokens_first: Sequence[int],
+                      tokens_second: Sequence[int],
+                      ngrams_length: int = 3) -> float:
     '''
         The function returns the value of the Jakkar coefficient
 
        @param tokens_first - list of tokens of the first program
       @param tokens_second - list of tokens of the second program
     '''
-    ngrams_first = generate_ngrams(tokens_first, ngrams_length, unique=True)
-    ngrams_second = generate_ngrams(tokens_second, ngrams_length, unique=True)
+    ngrams_first: Set[Tuple[int, ...]] = generate_ngrams(
+        tokens_first, ngrams_length, unique=True
+    )
+    ngrams_second: Set[Tuple[int, ...]] = generate_ngrams(
+        tokens_second, ngrams_length, unique=True
+    )
 
     intersection = len(ngrams_first.intersection(ngrams_second))
     union = len(ngrams_first | ngrams_second)
@@ -73,7 +87,7 @@
 
 
 # equal to the Levenshtein length
-def lcs(X, Y):
+def lcs(X: Sequence[int], Y: Sequence[int]) -> int:
     '''
         The function returns the length of the longest common subsequence
         of two sequences X and Y.
@@ -94,15 +108,15 @@
         for j in range(n + 1):
             if i == 0 or j == 0:
                 L[i][j] = 0
-            elif X[i-1] == Y[j-1]:
-                L[i][j] = L[i-1][j-1] + 1
+            elif X[i - 1] == Y[j - 1]:
+                L[i][j] = L[i - 1][j - 1] + 1
             else:
-                L[i][j] = max(L[i-1][j], L[i][j-1])
+                L[i][j] = max(L[i - 1][j], L[i][j - 1])
 
     return L[m][n]
 
 
-def lcs_based_coeff(subseq1, subseq2):
+def lcs_based_coeff(subseq1: Sequence[int], subseq2: Sequence[int]) -> float:
     """The function returns coefficient based on the length
        of the longest common subsequence.
 
       This coefficient describes how same two sequences.
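A usage sketch for the newly annotated n-gram helpers — illustrative token values, not part of the patch; assumes `src/` is on `PYTHONPATH`:

```python
from codeplag.algorithms.tokenbased import generate_ngrams, value_jakkar_coef

first = [1, 2, 3, 4, 5]
second = [1, 2, 3, 4, 6]

# Unique trigrams: {(1, 2, 3), (2, 3, 4), (3, 4, 5)} vs {(1, 2, 3), (2, 3, 4), (3, 4, 6)}
print(generate_ngrams(first, n=3, unique=True))

# Two of the four distinct trigrams are shared, so by the Jaccard
# definition |A & B| / |A | B| = 2 / 4:
print(value_jakkar_coef(first, second))  # 0.5
```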
diff --git a/src/codeplag/astfeatures.py b/src/codeplag/astfeatures.py
index c778679..9567ac3 100644
--- a/src/codeplag/astfeatures.py
+++ b/src/codeplag/astfeatures.py
@@ -1,19 +1,24 @@
-# TODO: Use NamedDict
+from collections import defaultdict
+from typing import Dict, List
+
+from codeplag.types import NodeCodePlace, NodeStructurePlace
+
+
 class ASTFeatures:
-    def __init__(self, filepath=''):
-        self.filepath = filepath
+    def __init__(self, filepath: str):
+        self.filepath: str = filepath
 
         self.count_of_nodes = 0
-        self.head_nodes = []
-        self.operators = {}
-        self.keywords = {}
-        self.literals = {}
+        self.head_nodes: List[str] = []
+        self.operators: Dict[str, int] = defaultdict(lambda: 0)
+        self.keywords: Dict[str, int] = defaultdict(lambda: 0)
+        self.literals: Dict[str, int] = defaultdict(lambda: 0)
 
         # unique nodes
-        self.unodes = {}
-        self.from_num = {}
-        self.count_unodes = 0
+        self.unodes: Dict[str, int] = {}
+        self.from_num: Dict[int, str] = {}
+        self.count_unodes: int = 0
 
-        self.structure = []
-        self.tokens = []
-        self.tokens_pos = []
+        self.structure: List[NodeStructurePlace] = []
+        self.tokens: List[int] = []
+        self.tokens_pos: List[NodeCodePlace] = []
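The `defaultdict(lambda: 0)` counters introduced here are what let the token visitors in `cplag/tree.py` and `pyplag/astwalkers.py` (further down in this patch) drop their `if key not in dict` branches; a minimal stdlib-only illustration:

```python
from collections import defaultdict

counters = defaultdict(lambda: 0)  # same construction as ASTFeatures.operators

for node_name in ['Add', 'AugAssign', 'Add']:
    counters[node_name] += 1       # missing keys start at 0, no membership check

print(dict(counters))  # {'Add': 2, 'AugAssign': 1}
```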
diff --git a/src/codeplag/codeplagcli.py b/src/codeplag/codeplagcli.py
index 9f43558..11cc8e4 100644
--- a/src/codeplag/codeplagcli.py
+++ b/src/codeplag/codeplagcli.py
@@ -79,6 +79,12 @@ def __init__(self):
             choices=["many_to_many"],
             default="many_to_many"
         )
+        self.add_argument(
+            '-rd', '--reports_directory',
+            help="If defined, then saves reports about suspect works "
+                 "into the provided path.",
+            type=DirPath
+        )
         self.add_argument(
             "-sp", "--show_progress",
             help="Show current progress of searching plagiarism.",
diff --git a/src/codeplag/consts.py.in b/src/codeplag/consts.py.in
index 549888d..cb4f93a 100644
--- a/src/codeplag/consts.py.in
+++ b/src/codeplag/consts.py.in
@@ -1,17 +1,21 @@
-FILE_DOWNLOAD_PATH = "/tmp/@UTIL_NAME@_download.out"
-LOG_PATH = "@CODEPLAG_LOG_PATH@"
-SUPPORTED_EXTENSIONS = {
+import re
+from typing import Dict
+
+FILE_DOWNLOAD_PATH: str = "/tmp/@UTIL_NAME@_download.out"
+GET_FRAZE: str = 'Getting works features from'
+LOG_PATH: str = "@CODEPLAG_LOG_PATH@"
+SUPPORTED_EXTENSIONS: Dict[str, tuple] = {
     'py': (
-        r'\.py$',
+        re.compile(r'\.py$'),
     ),
     'cpp': (
-        r'\.cpp$',
-        r'\.c$',
-        r'\.h$'
+        re.compile(r'\.cpp$'),
+        re.compile(r'\.c$'),
+        re.compile(r'\.h$')
     ),
     'default': (
-        r'\..*$',
+        re.compile(r'\..*$'),
     )
 }
-UTIL_NAME = "@UTIL_NAME@"
-UTIL_VERSION = "@UTIL_VERSION@"
+UTIL_NAME: str = "@UTIL_NAME@"
+UTIL_VERSION: str = "@UTIL_VERSION@"
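Since `SUPPORTED_EXTENSIONS` now holds precompiled patterns, callers can keep matching paths without recompiling on every check; a hedged sketch (file names are made up):

```python
import re

# Mirrors the 'py' entry substituted into src/codeplag/consts.py
py_patterns = (re.compile(r'\.py$'),)

def is_supported(path, patterns):
    # re.search() also accepts an already compiled pattern, which is why
    # GitHubParser.is_accepted_extension keeps working with these tuples.
    return any(re.search(pattern, path) for pattern in patterns)

print(is_supported('src/main.py', py_patterns))   # True
print(is_supported('src/main.cpp', py_patterns))  # False
```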
diff --git a/src/codeplag/cplag/tree.py b/src/codeplag/cplag/tree.py
index aecc835..bb9fa34 100644
--- a/src/codeplag/cplag/tree.py
+++ b/src/codeplag/cplag/tree.py
@@ -6,20 +6,19 @@
 from codeplag.cplag.const import IGNORE, OPERATORS
 
 
-# TODO: iterate by iterotor retruned by tree.get_children()
 def get_not_ignored(tree: Cursor, src: str) -> List[Cursor]:
     '''
         Function helps to discard unnecessary nodes such as imports
     '''
-    children = list(tree.get_children())
-    length = len(children)
     parsed_nodes = []
-    for i in range(length):
-        loc = children[i].location.file
-        if (str(loc).split('/')[-1] == src.split('/')[-1]
-                and children[i].kind not in IGNORE):
-            parsed_nodes.append(children[i])
+    for child in tree.get_children():
+        loc = child.location.file
+        if (
+            str(loc).split('/')[-1] == src.split('/')[-1] and
+            child.kind not in IGNORE
+        ):
+            parsed_nodes.append(child)
 
     return parsed_nodes
 
@@ -62,22 +61,15 @@ def generic_visit(node, features: ASTFeatures, curr_depth: int = 0) -> None:
 
 def get_features(tree: Cursor, filepath: str = '') -> ASTFeatures:
     features = ASTFeatures(filepath)
     for token in tree.get_tokens():
-        if (token.kind == TokenKind.PUNCTUATION and
-                token.spelling in OPERATORS):
-            if token.spelling not in features.operators:
-                features.operators[token.spelling] = 1
-            else:
-                features.operators[token.spelling] += 1
-        if (token.kind == TokenKind.KEYWORD):
-            if token.spelling not in features.keywords:
-                features.keywords[token.spelling] = 1
-            else:
-                features.keywords[token.spelling] += 1
-        if (token.kind == TokenKind.LITERAL):
-            if token.spelling not in features.literals:
-                features.literals[token.spelling] = 1
-            else:
-                features.literals[token.spelling] += 1
+        if (
+            token.kind == TokenKind.PUNCTUATION and
+            token.spelling in OPERATORS
+        ):
+            features.operators[token.spelling] += 1
+        if token.kind == TokenKind.KEYWORD:
+            features.keywords[token.spelling] += 1
+        if token.kind == TokenKind.LITERAL:
+            features.literals[token.spelling] += 1
 
     generic_visit(tree, features)
diff --git a/src/codeplag/logger.py b/src/codeplag/logger.py
index de0cb12..649c23d 100644
--- a/src/codeplag/logger.py
+++ b/src/codeplag/logger.py
@@ -21,9 +21,9 @@ def get_file_handler(filename: str) -> logging.FileHandler:
 def get_stream_handler() -> logging.StreamHandler:
     log_format = ('%(asctime)s - [%(levelname)s] - %(message)s')
     log_arguments = {
-            'fmt': log_format,
-            'datefmt': '%b %-d %T'
-    }
+        'fmt': log_format,
+        'datefmt': '%b %-d %T'
+    }
     stream_handler = logging.StreamHandler(stream=sys.stdout)
     stream_handler.setLevel(logging.INFO)
     stream_handler.setFormatter(logging.Formatter(**log_arguments))
diff --git a/src/codeplag/pyplag/astwalkers.py b/src/codeplag/pyplag/astwalkers.py
index a574970..b66187e 100644
--- a/src/codeplag/pyplag/astwalkers.py
+++ b/src/codeplag/pyplag/astwalkers.py
@@ -3,6 +3,7 @@
 from codeplag.astfeatures import ASTFeatures
 from codeplag.pyplag.const import (IGNORE_NODES, KEYWORDS, LITERALS,
                                    OPERATORS, TO_TOKEN)
+from codeplag.types import NodeCodePlace, NodeStructurePlace
 
 
 class ASTWalker(ast.NodeVisitor):
@@ -17,8 +18,12 @@ def add_unique_node(self, node_name: str) -> None:
         self.features.count_unodes += 1
 
     def add_node_to_structure(self, node_name: str) -> None:
-        self.features.structure.append((self.curr_depth,
-                                        self.features.unodes[node_name]))
+        self.features.structure.append(
+            NodeStructurePlace(
+                depth=self.curr_depth,
+                uid=self.features.unodes[node_name]
+            )
+        )
         if self.curr_depth == 1:
             self.features.head_nodes.append(node_name)
 
@@ -32,26 +37,21 @@ def generic_visit(self, node: ast.AST) -> None:
         if type_name in TO_TOKEN:
             self.features.tokens.append(TO_TOKEN[type_name])
             if 'lineno' in dir(node) and 'col_offset' in dir(node):
-                self.features.tokens_pos.append((node.lineno,
-                                                 node.col_offset))
+                self.features.tokens_pos.append(
+                    NodeCodePlace(
+                        lineno=node.lineno,
+                        col_offset=node.col_offset
+                    )
+                )
             else:
                 self.features.tokens_pos.append(self.features.tokens_pos[-1])
 
         if type_name in OPERATORS:
-            if type_name not in self.features.operators:
-                self.features.operators[type_name] = 1
-            else:
-                self.features.operators[type_name] += 1
+            self.features.operators[type_name] += 1
         elif type_name in KEYWORDS:
-            if type_name not in self.features.keywords:
-                self.features.keywords[type_name] = 1
-            else:
-                self.features.keywords[type_name] += 1
+            self.features.keywords[type_name] += 1
         elif type_name in LITERALS:
-            if type_name not in self.features.literals:
-                self.features.literals[type_name] = 1
-            else:
-                self.features.literals[type_name] += 1
+            self.features.literals[type_name] += 1
 
         if type_name not in IGNORE_NODES:
             if self.curr_depth != 0:
diff --git a/src/codeplag/pyplag/utils.py b/src/codeplag/pyplag/utils.py
index 43e7e44..be69531 100644
--- a/src/codeplag/pyplag/utils.py
+++ b/src/codeplag/pyplag/utils.py
@@ -1,6 +1,6 @@
 import ast
 import os
-from typing import List, Union
+from typing import List, Optional
 
 from termcolor import colored
 
@@ -13,7 +13,7 @@
 logger = get_logger(__name__, LOG_PATH)
 
 
-def get_ast_from_content(code: str, path: str) -> Union[ast.Module, None]:
+def get_ast_from_content(code: str, path: str) -> Optional[ast.Module]:
     tree = None
 
     # TODO: Add logging and check for correct colored output
@@ -49,7 +49,7 @@
     return tree
 
 
-def get_ast_from_filename(filename: str) -> Union[ast.Module, None]:
+def get_ast_from_filename(filename: str) -> Optional[ast.Module]:
     '''
         Function return ast which has type ast.Module
         @param filename - full path to file with code which will have
diff --git a/src/codeplag/types.py b/src/codeplag/types.py
new file mode 100644
index 0000000..71bebe8
--- /dev/null
+++ b/src/codeplag/types.py
@@ -0,0 +1,41 @@
+from typing import Dict, List, NamedTuple, TypedDict
+
+import numpy as np
+
+
+class NodeCodePlace(NamedTuple):
+    lineno: int
+    col_offset: int
+
+
+class NodeStructurePlace(NamedTuple):
+    depth: int
+    uid: int
+
+
+class FastMetrics(NamedTuple):
+    jakkar: float
+    operators: float
+    keywords: float
+    literals: float
+    weighted_average: float
+
+
+class StructuresInfo(NamedTuple):
+    similarity: float
+    compliance_matrix: np.array
+
+
+class CompareInfo(NamedTuple):
+    fast: FastMetrics
+    structure: StructuresInfo = None
+
+
+class WorksReport(TypedDict):
+    date: str
+    first_path: str
+    second_path: str
+    first_heads: List[str]
+    second_heads: List[str]
+    fast: Dict[str, int]
+    structure: dict
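A construction sketch for the relocated result types — field values are invented, and it assumes `src/` is on `PYTHONPATH`; the `_asdict()` call is the same conversion `save_result` performs below:

```python
import numpy as np

from codeplag.types import CompareInfo, FastMetrics, StructuresInfo

fast = FastMetrics(jakkar=0.5, operators=0.8, keywords=1.0,
                   literals=0.6, weighted_average=0.7)
structure = StructuresInfo(similarity=0.42,
                           compliance_matrix=np.zeros((2, 2, 2)))
metrics = CompareInfo(fast=fast, structure=structure)

# NamedTuple -> dict, as done when building a WorksReport
print(metrics.fast._asdict()['weighted_average'])  # 0.7
```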
diff --git a/src/codeplag/utils.py b/src/codeplag/utils.py
index b64ec45..0567351 100644
--- a/src/codeplag/utils.py
+++ b/src/codeplag/utils.py
@@ -1,9 +1,12 @@
+import json
 import logging
 import os
 import re
 import sys
+import uuid
+from datetime import datetime
 from time import perf_counter
-from typing import List, NamedTuple, Tuple
+from typing import List
 
 import argcomplete
 import numpy as np
@@ -14,41 +17,25 @@
 from codeplag.algorithms.tokenbased import value_jakkar_coef
 from codeplag.astfeatures import ASTFeatures
 from codeplag.codeplagcli import CodeplagCLI
-from codeplag.consts import FILE_DOWNLOAD_PATH, SUPPORTED_EXTENSIONS
+from codeplag.consts import (FILE_DOWNLOAD_PATH, GET_FRAZE, LOG_PATH,
+                             SUPPORTED_EXTENSIONS)
 from codeplag.cplag.const import COMPILE_ARGS
 from codeplag.cplag.tree import get_features as get_features_cpp
 from codeplag.cplag.util import \
     get_cursor_from_file as get_cursor_from_file_cpp
 from codeplag.cplag.util import \
     get_works_from_filepaths as get_works_from_filepaths_cpp
+from codeplag.logger import get_logger
 from codeplag.pyplag.utils import \
     get_ast_from_content as get_ast_from_content_py
 from codeplag.pyplag.utils import \
     get_features_from_ast as get_features_from_ast_py
 from codeplag.pyplag.utils import \
     get_works_from_filepaths as get_works_from_filepaths_py
+from codeplag.types import (CompareInfo, FastMetrics, NodeCodePlace,
+                            StructuresInfo, WorksReport)
 from webparsers.github_parser import GitHubParser
 
-GET_FRAZE = 'Getting works features from'
-
-
-class FastMetrics(NamedTuple):
-    jakkar: float
-    operators: float
-    keywords: float
-    literals: float
-    weighted_average: float
-
-
-class StructuresInfo(NamedTuple):
-    similarity: float
-    compliance_matrix: np.array
-
-
-class CompareInfo(NamedTuple):
-    fast: FastMetrics
-    structure: StructuresInfo = None
-
 
 class Colors:
     HEADER = '\033[95m'
@@ -183,8 +170,8 @@ def print_compare_result(features1: ASTFeatures,
             compare_info.structure.compliance_matrix.shape[1]
         ):
             data[row][col] = (
-                compare_info.structure.compliance_matrix[row][col][0] /
-                compare_info.structure.compliance_matrix[row][col][1]
+                compare_info.structure.compliance_matrix[row][col][0]
+                / compare_info.structure.compliance_matrix[row][col][1]
             )
         compliance_matrix_df = pd.DataFrame(
             data=data,
@@ -224,7 +211,7 @@ def get_files_path_from_directory(directory: str,
 
 def print_suspect_parts(source_code: str,
                         marked_tokens: List[int],
-                        tokens_pos: List[Tuple[int, int]],
+                        tokens_pos: List[NodeCodePlace],
                         color: str = Colors.FAIL) -> None:
     ROWS = {
         row for (row, _column) in
         [tokens_pos[index] for index in marked_tokens]
     }
 
     row = 1
-    column = 1
+    column = 1  # noqa
 
     for symbol in source_code:
         if symbol == '\n':
@@ -247,13 +234,13 @@ def print_suspect_parts(source_code: str,
 
 def print_code_and_highlight_suspect(source_code: str,
                                      marked_tokens: List[int],
-                                     tokens_pos: List[Tuple[int, int]],
+                                     tokens_pos: List[NodeCodePlace],
                                      color=Colors.FAIL) -> None:
     ROWS = {row for (row, column) in
             [tokens_pos[index] for index in marked_tokens]}
 
     row = 1
-    column = 1
+    column = 1  # noqa
 
     for symbol in source_code:
         if symbol == '\n':
@@ -271,9 +258,9 @@ def print_code_and_highlight_suspect(source_code: str,
 
 class CodeplagEngine:
 
     def __init__(self, logger: logging.Logger) -> None:
-        self.logger = logger
+        self.logger: logging.Logger = logger
 
-        self.parser = CodeplagCLI()
+        self.parser: CodeplagCLI = CodeplagCLI()
         argcomplete.autocomplete(self.parser)
 
     def set_access_token(self, env_path: str) -> None:
@@ -282,14 +269,24 @@ def set_access_token(self, env_path: str) -> None:
                 "Env file not found or not a file. "
                 "Trying to get token from environment."
             )
-            self.access_token = os.environ.get('ACCESS_TOKEN', '')
+            self.access_token: str = os.environ.get('ACCESS_TOKEN', '')
         else:
             env_config = Config(RepositoryEnv(env_path))
-            self.access_token = env_config.get('ACCESS_TOKEN', default='')
+            self.access_token: str = env_config.get('ACCESS_TOKEN', default='')
 
         if not self.access_token:
             self.logger.warning('GitHub access token is not defined.')
 
+    def set_github_parser(self, branch_policy: bool) -> None:
+        self.github_parser = GitHubParser(
+            file_extensions=SUPPORTED_EXTENSIONS[
+                self.extension
+            ],
+            check_policy=branch_policy,
+            access_token=self.access_token,
+            logger=get_logger('webparsers', LOG_PATH)
+        )
+
     def append_work_features(self,
                              file_content: str,
                              url_to_file: str) -> None:
@@ -377,6 +374,38 @@ def get_works_from_users_repos(self,
             for file_content, url_file in files:
                 self.append_work_features(file_content, url_file)
 
+    def save_result(self,
+                    first_work: ASTFeatures,
+                    second_work: ASTFeatures,
+                    metrics: CompareInfo,
+                    reports_dir: str) -> None:
+        # TODO: use TypedDict
+        struct_info_dict = metrics.structure._asdict()
+        struct_info_dict['compliance_matrix'] = (
+            struct_info_dict['compliance_matrix'].tolist()
+        )
+        report = WorksReport(
+            date=datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
+            first_path=first_work.filepath,
+            second_path=second_work.filepath,
+            first_heads=first_work.head_nodes,
+            second_heads=second_work.head_nodes,
+            fast=metrics.fast._asdict(),
+            structure=struct_info_dict
+        )
+        try:
+            report_file = f'{reports_dir}/{uuid.uuid4().hex}.json'
+            with open(report_file, 'w') as f:
+                f.write(json.dumps(report))
+        except PermissionError:
+            self.logger.warning(
+                "Not enough rights to write reports to the folder."
+            )
+        except FileNotFoundError:
+            self.logger.warning(
+                "Provided folder for reports does not exist."
+            )
+
     def run(self, args: List[str] = None) -> None:
         self.logger.debug("Starting codeplag util")
 
@@ -385,7 +414,15 @@ def run(self, args: List[str] = None) -> None:
 
         parsed_args = vars(self.parser.parse_args(args))
         self.set_access_token(parsed_args.get('environment'))
-        self.extension = parsed_args.get('extension')
+        self.extension: str = parsed_args.get('extension')
+        if any(
+            [
+                parsed_args.get('github_files'),
+                parsed_args.get('github_project_folders'),
+                parsed_args.get('github_user')
+            ]
+        ):
+            self.set_github_parser(parsed_args.get('all_branches'))
 
         self.logger.debug(
             f"Mode: {parsed_args['mode']}; "
@@ -395,14 +432,7 @@
         begin_time = perf_counter()
 
         if parsed_args.get('mode') == 'many_to_many':
-            self.works = []
-            self.github_parser = GitHubParser(
-                file_extensions=SUPPORTED_EXTENSIONS[
-                    self.extension
-                ],
-                check_policy=parsed_args.get('all_branches'),
-                access_token=self.access_token
-            )
+            self.works: List[ASTFeatures] = []
 
             self.get_works_from_files(parsed_args.get('files'))
             self.get_works_from_dirs(parsed_args.get('directories'))
@@ -420,13 +450,14 @@
             self.logger.info("Starting searching for plagiarism")
             count_works = len(self.works)
             iterations = int((count_works * (count_works - 1)) / 2)
-            iteration = 1
+            iteration = 0
             for i, work1 in enumerate(self.works):
                 for j, work2 in enumerate(self.works):
                     if i <= j:
                         continue
 
                     if parsed_args.get('show_progress'):
+                        iteration += 1
                         print(
                             f"Check progress: {iteration / iterations:.2%}.",
                             end='\r'
@@ -437,15 +468,22 @@
                         work2,
                         parsed_args.get('threshold')
                     )
-                    if metrics.structure:
-                        print_compare_result(
+                    if not metrics.structure:
+                        continue
+
+                    print_compare_result(
+                        work1,
+                        work2,
+                        metrics,
+                        parsed_args.get('threshold')
+                    )
+                    if parsed_args.get('reports_directory'):
+                        self.save_result(
                             work1,
                             work2,
                             metrics,
-                            parsed_args.get('threshold')
+                            parsed_args.get('reports_directory')
                         )
-                    iteration += 1
-
         self.logger.debug(f'Time for all {perf_counter() - begin_time:.2f} s')
         self.logger.info("Ending searching for plagiarism.")
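With `--reports_directory` set, `save_result` above writes one `<uuid4 hex>.json` file per suspicious pair. A hypothetical consumer of those files could look like this (the directory name is illustrative; the keys follow `WorksReport`):

```python
import json
import os

reports_dir = './reports'  # hypothetical path passed via --reports_directory

for name in os.listdir(reports_dir):
    if not name.endswith('.json'):
        continue
    with open(os.path.join(reports_dir, name)) as report_file:
        report = json.load(report_file)
    # 'fast' holds FastMetrics._asdict(); 'date' uses "%d/%m/%Y %H:%M:%S"
    print(report['date'], report['first_path'], report['second_path'],
          report['fast']['weighted_average'])
```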
diff --git a/src/webparsers/consts.py.in b/src/webparsers/consts.py.in
deleted file mode 100644
index d5be432..0000000
--- a/src/webparsers/consts.py.in
+++ /dev/null
@@ -1,2 +0,0 @@
-DEFAULT_FILE_EXTENSIONS = [r'.py$', r'.cpp$', r'.c$', r'.h$']
-LOG_PATH = "@WEBPARSERS_LOG_PATH@"
diff --git a/src/webparsers/github_parser.py b/src/webparsers/github_parser.py
index 42ea1f9..17ba512 100644
--- a/src/webparsers/github_parser.py
+++ b/src/webparsers/github_parser.py
@@ -1,26 +1,23 @@
 import base64
+import logging
 import re
 import sys
-from typing import List, Literal
+from typing import List, Literal, Optional
 
 import requests
 
-from webparsers.consts import DEFAULT_FILE_EXTENSIONS, LOG_PATH
-from webparsers.logger import get_logger
-
 
 class GitHubUrl(str):
     def __new__(cls, url: str):
         url_parts = url.rstrip('/').split('/')
         error_msg = f"'{url}' is incorrect link to GitHub"
-        if len(url_parts) < 3:
-            raise ValueError(error_msg)
-        if url_parts[0] != 'https:' and url_parts[0] != 'http:':
-            raise ValueError(error_msg)
-        elif url_parts[1] != '':
-            raise ValueError(error_msg)
-        elif url_parts[2] != 'github.com':
+        if (
+            len(url_parts) < 3 or
+            (url_parts[0] != 'https:' and url_parts[0] != 'http:') or
+            url_parts[1] != '' or
+            url_parts[2] != 'github.com'
+        ):
             raise ValueError(error_msg)
 
         obj = str.__new__(cls, url)
@@ -61,19 +58,19 @@ def __new__(cls, url: str):
 
 class GitHubParser:
-    def __init__(self, file_extensions: List[str] = None,
+    def __init__(self, file_extensions: Optional[List[str]] = None,
                  check_policy: Literal[0, 1] = 0,
                  access_token: str = '',
-                 log_path: str = LOG_PATH):
-        self.logger = get_logger(__name__, log_path)
-        if file_extensions is None:
-            self.__file_extensions = DEFAULT_FILE_EXTENSIONS
+                 logger: Optional[logging.Logger] = None):
+        if logger is None:
+            self.logger = logging.getLogger(__name__)
         else:
-            self.__file_extensions = file_extensions
+            self.logger = logger
 
+        self.__file_extensions = file_extensions
         self.__access_token = access_token
         self.__check_all_branches = check_policy
 
-    def decode_file_content(self, file_in_bytes):
+    def decode_file_content(self, file_in_bytes: bytes) -> str:
         attempt = 1
         code = None
         while code is None:
@@ -89,14 +86,17 @@ def decode_file_content(self, file_in_bytes):
 
         return code
 
-    def is_accepted_extension(self, path):
-        for extension in self.__file_extensions:
-            if re.search(extension, path):
-                return True
+    def is_accepted_extension(self, path: str) -> bool:
+        if self.__file_extensions is None:
+            return True
 
-        return False
+        return any(
+            re.search(extension, path) for extension in self.__file_extensions
+        )
 
-    def send_get_request(self, api_url, params=None):
+    def send_get_request(self,
+                         api_url: str,
+                         params: dict = None) -> requests.Response:
         if params is None:
             params = {}
@@ -156,17 +156,18 @@ def get_list_of_repos(self, owner, per_page=100, reg_exp=None):
                 'page': page
             }
             response_json = self.send_get_request(
-                                api_url,
-                                params=params
-                            ).json()
+                api_url,
+                params=params
+            ).json()
 
             if len(response_json) == 0:
                 break
 
             for repo in response_json:
-                if reg_exp is None:
-                    repos[repo['name']] = repo['html_url']
-                elif re.search(reg_exp, repo['name']) is not None:
+                if (
+                    (reg_exp is None) or
+                    re.search(reg_exp, repo['name']) is not None
+                ):
                     repos[repo['name']] = repo['html_url']
 
             page += 1
@@ -203,16 +204,16 @@ def get_files_generator_from_sha_commit(self, owner, repo, branch,
             current_path = f"{path}/{node['path']}"
             if node["type"] == "tree":
                 yield from self.get_files_generator_from_sha_commit(
-                               owner,
-                               repo,
-                               branch,
-                               node['sha'],
-                               current_path
-                           )
+                    owner,
+                    repo,
+                    branch,
+                    node['sha'],
+                    current_path
+                )
             if node["type"] == "blob" and self.is_accepted_extension(
-                   current_path
-               ):
+                current_path
+            ):
                 file_link = (
                     "https://github.com/"
                     f"{owner}/{repo}/blob/{branch}{current_path}"
@@ -260,20 +261,21 @@ def get_files_generator_from_repo_url(self, repo_url):
                 repo_url.owner, repo_url.repo
             )
         else:
-            branches = {default_branch: self.get_sha_last_branch_commit(
-                            repo_url.owner,
-                            repo_url.repo,
-                            default_branch
-                        )
-                        }
+            branches = {
+                default_branch: self.get_sha_last_branch_commit(
+                    repo_url.owner,
+                    repo_url.repo,
+                    default_branch
+                )
+            }
 
         for branch in branches.items():
             yield from self.get_files_generator_from_sha_commit(
-                           repo_url.owner,
-                           repo_url.repo,
-                           branch[0],
-                           branch[1]
-                       )
+                repo_url.owner,
+                repo_url.repo,
+                branch[0],
+                branch[1]
+            )
 
     def get_file_from_url(self, file_url):
         try:
@@ -293,11 +295,11 @@ def get_file_from_url(self, file_url):
 
         response_json = self.send_get_request(api_url, params=params).json()
         return self.get_file_content_from_sha(
-                   file_url.owner,
-                   file_url.repo,
-                   response_json['sha'],
-                   file_url
-               )
+            file_url.owner,
+            file_url.repo,
+            response_json['sha'],
+            file_url
+        )
 
     def get_files_generator_from_dir_url(self, dir_url):
         try:
@@ -320,23 +322,23 @@ def get_files_generator_from_dir_url(self, dir_url):
             current_path = "./" + node["path"]
             if node["type"] == "dir":
                 yield from self.get_files_generator_from_sha_commit(
-                               dir_url.owner,
-                               dir_url.repo,
-                               dir_url.branch,
-                               node['sha'],
-                               current_path
-                           )
+                    dir_url.owner,
+                    dir_url.repo,
+                    dir_url.branch,
+                    node['sha'],
+                    current_path
+                )
             if node["type"] == "file" and self.is_accepted_extension(
-                   node["name"]
-               ):
+                node["name"]
+            ):
                 file_link = (
                     'https://github.com/'
                     f'{dir_url.owner}/{dir_url.repo}'
                     f'/tree/{dir_url.branch}/{current_path[2:]}'
                 )
                 yield self.get_file_content_from_sha(
-                          dir_url.owner,
-                          dir_url.repo,
-                          node["sha"],
-                          file_link
-                      )
+                    dir_url.owner,
+                    dir_url.repo,
+                    node["sha"],
+                    file_link
+                )
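`GitHubParser` now takes an optional preconfigured `logging.Logger` instead of building one from a log path, and `file_extensions=None` means every file is accepted. A construction sketch mirroring `CodeplagEngine.set_github_parser` (the token is a placeholder, not a real credential):

```python
import logging

from webparsers.github_parser import GitHubParser

parser = GitHubParser(
    file_extensions=None,           # None now means "accept every file"
    check_policy=0,                 # default branch only
    access_token='<ACCESS_TOKEN>',  # placeholder
    logger=logging.getLogger('webparsers')
)
assert parser.is_accepted_extension('any/file.xyz')
```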
diff --git a/src/webparsers/logger.py b/src/webparsers/logger.py
deleted file mode 100644
index de0cb12..0000000
--- a/src/webparsers/logger.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import logging
-import sys
-
-
-def get_file_handler(filename: str) -> logging.FileHandler:
-    log_format = (
-        '%(asctime)s - [%(levelname)s] - %(name)s - '
-        '(%(filename)s).%(funcName)s(%(lineno)d) - %(message)s'
-    )
-    log_arguments = {
-        'fmt': log_format,
-        'datefmt': '%b %-d %T'
-    }
-    file_handler = logging.FileHandler(filename)
-    file_handler.setLevel(logging.DEBUG)
-    file_handler.setFormatter(logging.Formatter(**log_arguments))
-
-    return file_handler
-
-
-def get_stream_handler() -> logging.StreamHandler:
-    log_format = ('%(asctime)s - [%(levelname)s] - %(message)s')
-    log_arguments = {
-            'fmt': log_format,
-            'datefmt': '%b %-d %T'
-    }
-    stream_handler = logging.StreamHandler(stream=sys.stdout)
-    stream_handler.setLevel(logging.INFO)
-    stream_handler.setFormatter(logging.Formatter(**log_arguments))
-
-    return stream_handler
-
-
-def get_logger(name: str, filename: str) -> logging.Logger:
-    logger = logging.getLogger(name)
-    logger.setLevel(logging.DEBUG)
-    logger.addHandler(get_file_handler(filename))
-    logger.addHandler(get_stream_handler())
-
-    return logger
diff --git a/test/auto/test_bugs.py b/test/auto/test_bugs.py
index 531a0ac..a6e555f 100644
--- a/test/auto/test_bugs.py
+++ b/test/auto/test_bugs.py
@@ -1,6 +1,6 @@
 import re
 
-from utils import SUCCESS_CODE, run_util
+from utils import SUCCESS_CODE, run_cmd, run_util
 
 
 def test_log_once():
@@ -18,3 +18,9 @@ def test_log_once():
     handled_stdout = re.sub(pattern, "", output_result, count=1)
 
     assert pattern not in handled_stdout
+
+
+def test_man_unminimized():
+    result = run_cmd(['dpkg-divert', '--truename', '/usr/bin/man'])
+
+    assert result.stdout.decode('utf-8').strip() == '/usr/bin/man'
diff --git a/test/auto/test_functional.py b/test/auto/test_functional.py
index 8ed8fa7..13013c2 100644
--- a/test/auto/test_functional.py
+++ b/test/auto/test_functional.py
@@ -1,7 +1,14 @@
+import json
+import os
+import re
+import shutil
+from contextlib import suppress
+
 import pytest
+from utils import SUCCESS_CODE, run_util
 
 from codeplag.consts import UTIL_NAME, UTIL_VERSION
-from utils import SUCCESS_CODE, run_util
+from codeplag.types import WorksReport
 
 CPP_FILES = [
     'test/unit/codeplag/cplag/data/sample1.cpp',
@@ -14,6 +21,7 @@
 CPP_DIR = 'test/unit/codeplag/cplag/data'
 PY_DIR = 'test/unit/codeplag/cplag'
 REPO_URL = 'https://github.com/OSLL/code-plagiarism'
+REPORTS_FOLDER = os.path.abspath('./reports')
 CPP_GITHUB_FILES = [
     f'{REPO_URL}/blob/main/test/unit/codeplag/cplag/data/sample3.cpp',
     f'{REPO_URL}/blob/main/test/unit/codeplag/cplag/data/sample4.cpp'
@@ -92,3 +100,25 @@ def test_compare_py_files(cmd, out):
 
     assert result.returncode == SUCCESS_CODE
     assert out in result.stdout
+
+
+def test_save_reports():
+    with suppress(Exception):
+        os.mkdir(REPORTS_FOLDER)
+    assert os.path.exists(REPORTS_FOLDER)
+
+    run_util(
+        ['--directories', './test', '--reports_directory', REPORTS_FOLDER]
+    )
+    reports_files = os.listdir(REPORTS_FOLDER)
+
+    assert len(reports_files) > 0
+    for file in reports_files:
+        assert re.search('.*[.]json$', file)
+        filepath = f'{REPORTS_FOLDER}/{file}'
+        with open(filepath, 'r') as f:
+            report = json.loads(f.read())
+            for key in WorksReport.__annotations__.keys():
+                assert key in report
+
+    shutil.rmtree(REPORTS_FOLDER)
diff --git a/test/auto/utils.py b/test/auto/utils.py
index ad4fede..aa9dea9 100644
--- a/test/auto/utils.py
+++ b/test/auto/utils.py
@@ -5,8 +5,9 @@
 SUCCESS_CODE = 0
 
 
+def run_cmd(cmd):
+    return subprocess.run(cmd, stdout=subprocess.PIPE)
+
+
 def run_util(cmd, ext='py'):
-    return subprocess.run(
-        [UTIL_NAME] + ['--extension', ext] + cmd,
-        stdout=subprocess.PIPE
-    )
+    return run_cmd([UTIL_NAME] + ['--extension', ext] + cmd)
diff --git a/test/unit/codeplag/algorithms/test_tokenbased.py b/test/unit/codeplag/algorithms/test_tokenbased.py
index ea2bf1d..829f790 100644
--- a/test/unit/codeplag/algorithms/test_tokenbased.py
+++ b/test/unit/codeplag/algorithms/test_tokenbased.py
@@ -37,21 +37,21 @@ def test_generate_ngrams_and_hashit(self):
         for_bigrams = [1, 2, 3, 4, 5]
         res1 = generate_ngrams(for_bigrams, 2, hashit=True)
         wait1 = [
-            hash(tuple(for_bigrams[i:i+2]))
+            hash(tuple(for_bigrams[i:i + 2]))
             for i in range(len(for_bigrams) - 1)
         ]
 
         for_trigrams = [3, 4, 7, 8, 15, 3]
         res2 = generate_ngrams(for_trigrams, 3, hashit=True)
         wait2 = [
-            hash(tuple(for_trigrams[i:i+3]))
+            hash(tuple(for_trigrams[i:i + 3]))
            for i in range(len(for_trigrams) - 2)
         ]
 
         for_fourgrams = [1, 3, 5, 7, 9, 7, 5]
         res3 = generate_ngrams(for_fourgrams, 4, hashit=True)
         wait3 = [
-            hash(tuple(for_fourgrams[i:i+4]))
+            hash(tuple(for_fourgrams[i:i + 4]))
             for i in range(len(for_fourgrams) - 3)
         ]
 
@@ -63,7 +63,7 @@ def test_generate_unique_ngrams_and_hashit(self):
         for_bigrams = [1, 2, 2, 2, 5]
         res1 = generate_ngrams(for_bigrams, 2, unique=True, hashit=True)
         wait1 = {
-            hash(tuple(for_bigrams[i:i+2]))
+            hash(tuple(for_bigrams[i:i + 2]))
             for i in range(len(for_bigrams) - 1)
         }
 
@@ -72,7 +72,7 @@
         res2 = generate_ngrams(
             for_trigrams, 3, unique=True, hashit=True
         )
         wait2 = {
-            hash(tuple(for_trigrams[i:i+3]))
+            hash(tuple(for_trigrams[i:i + 3]))
             for i in range(len(for_trigrams) - 2)
         }
 
@@ -81,7 +81,7 @@
         res3 = generate_ngrams(
             for_fourgrams, 4, unique=True, hashit=True
         )
         wait3 = {
-            hash(tuple(for_fourgrams[i:i+4]))
+            hash(tuple(for_fourgrams[i:i + 4]))
             for i in range(len(for_fourgrams) - 3)
         }
diff --git a/test/unit/codeplag/cplag/test_tree.py b/test/unit/codeplag/cplag/test_tree.py
index cc3e394..ca577e7 100644
--- a/test/unit/codeplag/cplag/test_tree.py
+++ b/test/unit/codeplag/cplag/test_tree.py
@@ -1,6 +1,8 @@
 import os
 import unittest
 
+from clang.cindex import CursorKind
+
 from codeplag.astfeatures import ASTFeatures
 from codeplag.cplag.const import COMPILE_ARGS
 from codeplag.cplag.tree import generic_visit, get_features, get_not_ignored
@@ -29,10 +31,38 @@ def test_get_not_ignored_normal(self):
         res1 = get_not_ignored(self.first_cursor, self.first_sample_path)
         res2 = get_not_ignored(self.second_cursor, self.second_sample_path)
 
-        self.assertEqual(type(res1), list)
-        self.assertEqual(type(res2), list)
-        self.assertEqual(len(res1), 1)
-        self.assertEqual(len(res2), 1)
+        main_node = res1[0]
+        assert main_node.spelling == 'gcd'
+        assert main_node.kind == CursorKind.FUNCTION_DECL
+
+        children = main_node.get_children()
+        expected_res = [
+            ('l', CursorKind.PARM_DECL),
+            ('r', CursorKind.PARM_DECL),
+            ('', CursorKind.COMPOUND_STMT)
+        ]
+        for index, child in enumerate(children):
+            assert expected_res[index][0] == child.spelling
+            assert expected_res[index][1] == child.kind
+
+        main_node = res2[0]
+        assert main_node.spelling == 'gcd'
+        assert main_node.kind == CursorKind.FUNCTION_DECL
+
+        children = main_node.get_children()
+        expected_res = [
+            ('a', CursorKind.PARM_DECL),
+            ('b', CursorKind.PARM_DECL),
+            ('', CursorKind.COMPOUND_STMT)
+        ]
+        for index, child in enumerate(children):
+            assert expected_res[index][0] == child.spelling
+            assert expected_res[index][1] == child.kind
+
+        assert type(res1) == list
+        assert type(res2) == list
+        assert len(res1) == 1
+        assert len(res2) == 1
 
     def test_generic_visit(self):
         features = ASTFeatures(self.first_sample_path)
diff --git a/test/unit/codeplag/pyplag/test_astwalkers.py b/test/unit/codeplag/pyplag/test_astwalkers.py
index d6520a5..0a973e6 100644
--- a/test/unit/codeplag/pyplag/test_astwalkers.py
+++ b/test/unit/codeplag/pyplag/test_astwalkers.py
@@ -13,25 +13,28 @@ class TestASTWalkers(unittest.TestCase):
     def test_astwalker_class_normal(self):
+        path = os.path.join(pwd, './data/test1.py')
         tree = get_ast_from_filename(os.path.join(pwd, './data/test1.py'))
-        features = ASTFeatures()
+        features = ASTFeatures(path)
         walker = ASTWalker(features)
         walker.visit(tree)
-        operators = {}
-        operators['AugAssign'] = np.int64(1)
-        operators['Add'] = np.int64(1)
-        keywords = {}
-        keywords['FunctionDef'] = np.int64(1)
-        keywords['Return'] = np.int64(1)
-        keywords['If'] = np.int64(1)
-        literals = {}
-
-        # ast.Constant с python >= 3.8 используется для всех констант
-        # до этого были NameConstant, Num и др.
-        literals['Constant'] = np.int64(3)
-
-        file_literals = {}
-        file_literals['Constant'] = 0
+        operators = {
+            'AugAssign': np.int64(1),
+            'Add': np.int64(1)
+        }
+        keywords = {
+            'FunctionDef': np.int64(1),
+            'Return': np.int64(1),
+            'If': np.int64(1)
+        }
+        literals = {
+            # Since Python >= 3.8, ast.Constant is used for all constants;
+            # previously it was NameConstant, Num, etc.
+            'Constant': np.int64(3)
+        }
+        file_literals = {
+            'Constant': 0,
+        }
         if 'Constant' in features.literals:
             file_literals['Constant'] = features.literals['Constant']
         unodes = 13
+ ) + + mocker.patch('builtins.open') + code_engine.save_result( + features1, + features2, + compare_info, + './src' + ) + + open.assert_called_once() + assert re.search('./src/.*[.]json$', open.call_args[0][0]) + assert re.search('w', open.call_args[0][1]) diff --git a/test/unit/webparsers/test_github_parser.py b/test/unit/webparsers/test_github_parser.py index 14caa5e..d7d3348 100644 --- a/test/unit/webparsers/test_github_parser.py +++ b/test/unit/webparsers/test_github_parser.py @@ -37,11 +37,12 @@ def test_decode_file_content(self): for test_case in test_cases: buf = io.StringIO() - with redirect_stdout(buf): - with self.subTest(test_case=test_case): - gh_parser = GitHubParser() - result = gh_parser.decode_file_content(**test_case['arguments']) - self.assertEqual(result, test_case['expected_result']) + with redirect_stdout(buf), self.subTest(test_case=test_case): + gh_parser = GitHubParser() + result = gh_parser.decode_file_content( + **test_case['arguments'] + ) + self.assertEqual(result, test_case['expected_result']) def test_is_accepted_extension(self): test_cases = [ @@ -91,7 +92,9 @@ def test_is_accepted_extension(self): for test_case in test_cases: with self.subTest(test_case=test_case): - rv = test_case['parser'].is_accepted_extension(**test_case['arguments']) + rv = test_case['parser'].is_accepted_extension( + **test_case['arguments'] + ) self.assertEqual(rv, test_case['expected_result']) @patch('webparsers.github_parser.requests.get') @@ -389,7 +392,10 @@ def json(self): rv = parser.get_list_of_repos(**test_case['arguments']) self.assertEqual(rv, test_case['expected_result']) - self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls']) + self.assertEqual( + mock_send_get_request.mock_calls, + test_case['send_calls'] + ) @patch('webparsers.github_parser.GitHubParser.send_get_request') def test_get_name_default_branch(self, mock_send_get_request): @@ -434,7 +440,10 @@ def json(self): rv = parser.get_name_default_branch(**test_case['arguments']) self.assertEqual(rv, test_case['expected_result']) - self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls']) + self.assertEqual( + mock_send_get_request.mock_calls, + test_case['send_calls'] + ) @patch('webparsers.github_parser.GitHubParser.send_get_request') def test_get_sha_last_branch_commit(self, mock_send_get_request): @@ -546,12 +555,11 @@ def json(self): mock_send_get_request.return_value = test_case['send_rv'] buf = io.StringIO() - with redirect_stdout(buf): - with self.subTest(test_case=test_case): - rv = parser.get_file_content_from_sha(**test_case['arguments']) - self.assertEqual(rv, test_case['expected_result']) + with redirect_stdout(buf), self.subTest(test_case=test_case): + rv = parser.get_file_content_from_sha(**test_case['arguments']) + self.assertEqual(rv, test_case['expected_result']) - self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls']) + self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls']) @patch('webparsers.github_parser.GitHubParser.get_file_content_from_sha') @patch('webparsers.github_parser.GitHubParser.send_get_request') @@ -782,12 +790,11 @@ def json(self): mock_send_get_request.side_effect = test_case['send_se'] buf = io.StringIO() - with redirect_stdout(buf): - with self.subTest(test_case=test_case): - rv = parser.get_list_repo_branches(**test_case['arguments']) - self.assertEqual(rv, test_case['expected_result']) + with redirect_stdout(buf), self.subTest(test_case=test_case): + rv = 
diff --git a/test/unit/webparsers/test_github_parser.py b/test/unit/webparsers/test_github_parser.py
index 14caa5e..d7d3348 100644
--- a/test/unit/webparsers/test_github_parser.py
+++ b/test/unit/webparsers/test_github_parser.py
@@ -37,11 +37,12 @@ def test_decode_file_content(self):
 
         for test_case in test_cases:
             buf = io.StringIO()
-            with redirect_stdout(buf):
-                with self.subTest(test_case=test_case):
-                    gh_parser = GitHubParser()
-                    result = gh_parser.decode_file_content(**test_case['arguments'])
-                    self.assertEqual(result, test_case['expected_result'])
+            with redirect_stdout(buf), self.subTest(test_case=test_case):
+                gh_parser = GitHubParser()
+                result = gh_parser.decode_file_content(
+                    **test_case['arguments']
+                )
+                self.assertEqual(result, test_case['expected_result'])
 
     def test_is_accepted_extension(self):
         test_cases = [
@@ -91,7 +92,9 @@ def test_is_accepted_extension(self):
 
         for test_case in test_cases:
             with self.subTest(test_case=test_case):
-                rv = test_case['parser'].is_accepted_extension(**test_case['arguments'])
+                rv = test_case['parser'].is_accepted_extension(
+                    **test_case['arguments']
+                )
                 self.assertEqual(rv, test_case['expected_result'])
 
     @patch('webparsers.github_parser.requests.get')
@@ -389,7 +392,10 @@ def json(self):
                 rv = parser.get_list_of_repos(**test_case['arguments'])
                 self.assertEqual(rv, test_case['expected_result'])
 
-            self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls'])
+            self.assertEqual(
+                mock_send_get_request.mock_calls,
+                test_case['send_calls']
+            )
 
     @patch('webparsers.github_parser.GitHubParser.send_get_request')
     def test_get_name_default_branch(self, mock_send_get_request):
@@ -434,7 +440,10 @@ def json(self):
                 rv = parser.get_name_default_branch(**test_case['arguments'])
                 self.assertEqual(rv, test_case['expected_result'])
 
-            self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls'])
+            self.assertEqual(
+                mock_send_get_request.mock_calls,
+                test_case['send_calls']
+            )
 
     @patch('webparsers.github_parser.GitHubParser.send_get_request')
     def test_get_sha_last_branch_commit(self, mock_send_get_request):
@@ -546,12 +555,11 @@ def json(self):
             mock_send_get_request.return_value = test_case['send_rv']
 
             buf = io.StringIO()
-            with redirect_stdout(buf):
-                with self.subTest(test_case=test_case):
-                    rv = parser.get_file_content_from_sha(**test_case['arguments'])
-                    self.assertEqual(rv, test_case['expected_result'])
+            with redirect_stdout(buf), self.subTest(test_case=test_case):
+                rv = parser.get_file_content_from_sha(**test_case['arguments'])
+                self.assertEqual(rv, test_case['expected_result'])
 
-                    self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls'])
+                self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls'])
 
     @patch('webparsers.github_parser.GitHubParser.get_file_content_from_sha')
     @patch('webparsers.github_parser.GitHubParser.send_get_request')
@@ -782,12 +790,11 @@ def json(self):
             mock_send_get_request.side_effect = test_case['send_se']
 
             buf = io.StringIO()
-            with redirect_stdout(buf):
-                with self.subTest(test_case=test_case):
-                    rv = parser.get_list_repo_branches(**test_case['arguments'])
-                    self.assertEqual(rv, test_case['expected_result'])
+            with redirect_stdout(buf), self.subTest(test_case=test_case):
+                rv = parser.get_list_repo_branches(**test_case['arguments'])
+                self.assertEqual(rv, test_case['expected_result'])
 
-            self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls'])
+            self.assertEqual(mock_send_get_request.mock_calls, test_case['send_calls'])
 
     @patch('webparsers.github_parser.GitHubParser.get_name_default_branch')
     @patch('webparsers.github_parser.GitHubParser.get_list_repo_branches')