Add workflows for testing and linting - part 2 #4

Closed · wants to merge 5 commits
7 changes: 7 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,7 @@
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"

29 changes: 29 additions & 0 deletions .github/workflows/code-style.yml
@@ -0,0 +1,29 @@
name: Code Style Check

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  lint:
    runs-on: ubuntu-latest

    steps:
      - name: Check out the code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"

      - name: Install flake8
        run: pip install flake8

      - name: Run flake8
        run: flake8 crivo tests

35 changes: 35 additions & 0 deletions .github/workflows/python-tests.yml
@@ -0,0 +1,35 @@
name: Python Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]

    steps:
      - name: Check out the code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      - name: Run tests
        run: pytest --cov=crivo
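
The final step runs `pytest --cov=crivo`, so every test the repository collects runs under coverage on each Python version in the matrix. As a sketch of the kind of test this job picks up, here is a hypothetical `tests/test_file_processor.py` exercising `process_file` (shown in the next diff); the file name and test bodies are illustrative, not part of this PR:

import pytest

from crivo.file_processor import process_file


def test_process_file_reads_content(tmp_path):
    # tmp_path is pytest's built-in temporary-directory fixture
    sample = tmp_path / "sample.txt"
    sample.write_text("192.168.0.1 example.com", encoding="utf-8")
    assert process_file(str(sample)) == "192.168.0.1 example.com"


def test_process_file_missing_file():
    # process_file re-raises FileNotFoundError with a clearer message
    with pytest.raises(FileNotFoundError):
        process_file("no_such_file.txt")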

4 changes: 2 additions & 2 deletions crivo/file_processor.py
@@ -11,6 +11,7 @@
 
 """
 
+
 def process_file(file_path):
     """
     Reads the content of a text file and returns it as a string.
@@ -22,11 +23,10 @@ def process_file(file_path):
         str: The content of the file.
     """
     try:
-        with open(file_path, 'r', encoding='utf-8') as file:
+        with open(file_path, "r", encoding="utf-8") as file:
             content = file.read()
             return content
     except FileNotFoundError:
         raise FileNotFoundError(f"Error: The file '{file_path}' was not found.")
     except IOError as e:
         raise IOError(f"Error reading the file '{file_path}': {e}")
-

29 changes: 20 additions & 9 deletions crivo/filter_engine.py
@@ -14,6 +14,7 @@
 import re
 import os
 
+
 def load_valid_tlds(file_path):
     """
     Loads a list of valid TLDs from a file and constructs a regex pattern.
@@ -25,18 +26,27 @@ def load_valid_tlds(file_path):
         str: A regex pattern matching valid TLDs.
     """
     try:
-        with open(file_path, 'r', encoding='utf-8') as f:
+        with open(file_path, "r", encoding="utf-8") as f:
             tlds = [line.strip() for line in f if line.strip()]
         if not tlds:
             raise ValueError("The TLD list is empty.")
         # Join TLDs into a single regex group
-        return '|'.join(tlds)
+        return "|".join(tlds)
     except FileNotFoundError:
         raise FileNotFoundError(f"Error: The file '{file_path}' was not found.")
     except IOError as e:
         raise IOError(f"Error reading the file '{file_path}': {e}")
 
-def filter_content(content, scope_filters=None, filter_ip=False, filter_ipv4=False, filter_ipv6=False, filter_domain=False, filter_url=False):
+
+def filter_content(
+    content,
+    scope_filters=None,
+    filter_ip=False,
+    filter_ipv4=False,
+    filter_ipv6=False,
+    filter_domain=False,
+    filter_url=False,
+):
     """
     Filters the provided content based on specified parameters.
@@ -53,12 +63,12 @@ def filter_content(content, scope_filters=None, filter_ip=False, filter_ipv4=Fal
         list: A list of unique filtered strings based on the criteria.
     """
     # Load the TLD regex dynamically
-    tld_regex = load_valid_tlds('valid_tlds.txt')
-    domain_pattern = rf'\b(?:[a-zA-Z0-9-]+\.)+(?:{tld_regex})\b'
+    tld_regex = load_valid_tlds("valid_tlds.txt")
+    domain_pattern = rf"\b(?:[a-zA-Z0-9-]+\.)+(?:{tld_regex})\b"
 
     # Other regex patterns
-    ipv4_pattern = r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
-    ipv6_pattern = r'\b(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}\b'
+    ipv4_pattern = r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b"
+    ipv6_pattern = r"\b(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}\b"
     url_pattern = r'\bhttps?://[^\s<>"\'#]+\b'
     results = set()
@@ -82,7 +92,8 @@ def filter_content(content, scope_filters=None, filter_ipv4=Fal
 
     # Apply scope filtering if provided
     if scope_filters:
-        results = {item for item in results if any(scope in item for scope in scope_filters)}
+        results = {
+            item for item in results if any(scope in item for scope in scope_filters)
+        }
 
     return sorted(results)
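
The middle of `filter_content` is collapsed in this diff, but the patterns themselves are easy to check in isolation with `re.findall`. A standalone sketch, where the inline alternation `com|net|org` stands in for whatever `valid_tlds.txt` actually contains:

import re

# Same patterns as above, with a stand-in TLD alternation
ipv4_pattern = r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b"
domain_pattern = r"\b(?:[a-zA-Z0-9-]+\.)+(?:com|net|org)\b"

text = "Visit https://example.com or ping 10.0.0.1; skip bad.host.invalidtld"

print(re.findall(ipv4_pattern, text))    # ['10.0.0.1']
print(re.findall(domain_pattern, text))  # ['example.com']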

85 changes: 70 additions & 15 deletions crivo/main.py
@@ -18,6 +18,7 @@
 from crivo.filter_engine import filter_content
 from crivo.version import __version__
 
+
 def parse_arguments():
     """
     Sets up the command-line arguments for Crivo.
@@ -28,22 +29,50 @@ def parse_arguments():
     parser = argparse.ArgumentParser(
         description="Crivo - A tool for extracting and filtering URLs, IPs, domains, and subdomains from text or web pages, with built-in web scraping capabilities."
     )
-
-    parser.add_argument("-f", "--file", help="Input file with text to be filtered", type=str)
-    parser.add_argument("-w", "--webpage", help="URL of a webpage to have its content filtered", type=str)
-    parser.add_argument("-W", "--webpage-list", help="File containing a list of webpage URLs to be filtered", type=str)
-
+    parser.add_argument(
+        "-f", "--file", help="Input file with text to be filtered", type=str
+    )
+    parser.add_argument(
+        "-w",
+        "--webpage",
+        help="URL of a webpage to have its content filtered",
+        type=str,
+    )
+    parser.add_argument(
+        "-W",
+        "--webpage-list",
+        help="File containing a list of webpage URLs to be filtered",
+        type=str,
+    )
     parser.add_argument("-o", "--output", help="Write the output to a file", type=str)
-    parser.add_argument("-s", "--scope", help="Comma-separated sequence of scope filters (ips, urls, domains, subdomains)", type=str)
+    parser.add_argument(
+        "-s",
+        "--scope",
+        help="Comma-separated sequence of scope filters (ips, urls, domains, subdomains)",
+        type=str,
+    )
     parser.add_argument("--ip", help="Filter only IPs", action="store_true")
     parser.add_argument("--ipv4", help="Filter only IPv4", action="store_true")
     parser.add_argument("--ipv6", help="Filter only IPv6", action="store_true")
-    parser.add_argument("--domain", help="Filter only domains and subdomains", action="store_true")
+    parser.add_argument(
+        "--domain", help="Filter only domains and subdomains", action="store_true"
+    )
     parser.add_argument("--url", help="Filter only URLs", action="store_true")
-    parser.add_argument("-v", "--verbose", help="Enable verbose output", action="store_true")
-    parser.add_argument("-V", "--version", help="Show the programme version", action="version", version=f"Crivo {__version__}")
+    parser.add_argument(
+        "-v", "--verbose", help="Enable verbose output", action="store_true"
+    )
+    parser.add_argument(
+        "-V",
+        "--version",
+        help="Show the programme version",
+        action="version",
+        version=f"Crivo {__version__}",
+    )
 
     return parser.parse_args()
 
+
 def main():
     """
     Main logic for Crivo.
@@ -53,15 +82,27 @@ def main():
     if args.file:
         content = process_file(args.file)
         filtered_output = filter_content(
-            content, args.scope.split(",") if args.scope else [], args.ip, args.ipv4, args.ipv6, args.domain, args.url
+            content,
+            args.scope.split(",") if args.scope else [],
+            args.ip,
+            args.ipv4,
+            args.ipv6,
+            args.domain,
+            args.url,
         )
         print("\n".join(filtered_output))
 
     elif args.webpage:
         print(args.webpage)
         content = scrape_webpage(args.webpage)
         filtered_output = filter_content(
-            content, args.scope.split(",") if args.scope else [], args.ip, args.ipv4, args.ipv6, args.domain, args.url
+            content,
+            args.scope.split(",") if args.scope else [],
+            args.ip,
+            args.ipv4,
+            args.ipv6,
+            args.domain,
+            args.url,
         )
         print("\n".join(filtered_output))
 
@@ -76,18 +117,32 @@ def main():
             try:
                 content = scrape_webpage(url)
                 filtered_output = filter_content(
-                    content, args.scope.split(",") if args.scope else [], args.ip, args.ipv4, args.ipv6, args.domain, args.url
+                    content,
+                    args.scope.split(",") if args.scope else [],
+                    args.ip,
+                    args.ipv4,
+                    args.ipv6,
+                    args.domain,
+                    args.url,
                 )
-                print("\n".join(filtered_output))  # Print results directly after the URL
+                print(
+                    "\n".join(filtered_output)
+                )  # Print results directly after the URL
             except Exception as e:
                 print(f"Error fetching content from {url}: {e}", file=sys.stderr)
 
     except FileNotFoundError:
-        print(f"Error: The file '{args.webpage_list}' was not found.", file=sys.stderr)
+        print(
+            f"Error: The file '{args.webpage_list}' was not found.", file=sys.stderr
+        )
         sys.exit(1)
     else:
-        print("Error: An input file (-f), a webpage (-w), or a webpage list (-W) is required.", file=sys.stderr)
+        print(
+            "Error: An input file (-f), a webpage (-w), or a webpage list (-W) is required.",
+            file=sys.stderr,
+        )
         sys.exit(1)
 
 
 if __name__ == "__main__":
     main()
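
Because `main()` reads its options from `sys.argv` through argparse, the CLI can also be driven programmatically. A sketch, where `urls.txt` is a hypothetical input file and the flags are the ones registered in `parse_arguments()`:

import sys

from crivo.main import main

# Equivalent to the shell invocation: crivo -f urls.txt --ipv4 --url -s example.com
sys.argv = ["crivo", "-f", "urls.txt", "--ipv4", "--url", "-s", "example.com"]
main()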
4 changes: 3 additions & 1 deletion crivo/utils.py
@@ -11,6 +11,7 @@
 
 """
 
+
 def normalise_text(text):
     """
     Normalises text by stripping whitespace and converting to lowercase.
@@ -23,6 +24,7 @@ def normalise_text(text):
     """
     return text.strip().lower()
 
+
 def validate_scope(scope):
     """
     Validates the provided scope filters and ensures they are in the correct format.
@@ -37,6 +39,7 @@ def validate_scope(scope):
         return []
     return [s.strip().lower() for s in scope.split(",") if s.strip()]
 
+
 def log_verbose(message, verbose):
     """
     Prints a message if verbose mode is enabled.
@@ -47,4 +50,3 @@ def log_verbose(message, verbose):
 
     if verbose:
         print(message)
-
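
These helpers are small enough to exercise directly. A quick behavioural sketch, assuming the `crivo` package is importable:

from crivo.utils import log_verbose, normalise_text, validate_scope

print(normalise_text("  ExAmPlE.COM  "))  # -> "example.com"
print(validate_scope("IPs, urls, "))      # -> ["ips", "urls"] (empty entries dropped)

log_verbose("shown in verbose mode", verbose=True)   # printed
log_verbose("hidden otherwise", verbose=False)       # suppressed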

2 changes: 1 addition & 1 deletion crivo/web_scraper.py
@@ -15,6 +15,7 @@
 from urllib.parse import urljoin
 import re
 
+
 def scrape_webpage(url, resolve_links=False):
     """
     Fetches the content of a webpage and optionally resolves relative links.
@@ -47,4 +48,3 @@ def scrape_webpage(url, resolve_links=False):
         return content
     except requests.exceptions.RequestException as e:
         raise RuntimeError(f"Error fetching the webpage '{url}': {e}")
-
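
On the calling side, the `RuntimeError` wrapping means a single `except` clause covers every transport failure `requests` can raise. A usage sketch with a placeholder URL; since the body of `scrape_webpage` is collapsed here, only the signature and the error contract shown above are relied on:

from crivo.web_scraper import scrape_webpage

try:
    content = scrape_webpage("https://example.com")
    print(content[:200])
except RuntimeError as e:
    # Wraps any underlying requests.exceptions.RequestException
    print(f"Scrape failed: {e}")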

3 changes: 2 additions & 1 deletion crivo_cli.py
@@ -13,12 +13,13 @@
 
 from crivo.main import main
 
+
 def run_cli():
     """
     Runs the Crivo command-line interface.
     """
     main()
 
+
 if __name__ == "__main__":
     run_cli()
-

1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -3,6 +3,7 @@
 # Testing framework
 pytest>=7.0
 pytest-mock>=3.0
+pytest-cov>=3.0
 
 # Code style and linting
 flake8>=5.0
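
`pytest-cov` provides the `--cov` option used by the Python Tests workflow. Once installed, the same run can be driven from Python as well as from the shell; a sketch:

import sys

import pytest

# Equivalent to running `pytest --cov=crivo` in the shell
sys.exit(pytest.main(["--cov=crivo"]))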