Skip to content
This repository has been archived by the owner on Sep 5, 2024. It is now read-only.

[WIP] Add match type and filter params #37

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions comcrawl/core/index_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import logging
from typing import Optional
from ..types import IndexList, ResultList
from ..utils import (
fetch_available_indexes,
Expand Down Expand Up @@ -50,7 +51,7 @@ def __init__(self,

self.results: ResultList = []

def search(self,
           url: str,
           threads: Optional[int] = None,
           filter: Optional[str] = None,
           match_type: Optional[str] = None) -> None:
    """Search.

    Searches the Common Crawl indexes this class was
    initialized with for the given URL pattern.

    Args:
        url: URL pattern to search the indexes for.
        threads: Number of threads to use. Enables
            multi-threading only if set.
            (Bug fix: the PR had `threads: int = Optional[None]`,
            which makes the *default value* the typing object
            `Optional[None]` instead of `None`.)
        filter: Optional CDX filter expression, appended to the
            query URL as `&filter=...` by the index search helper.
            NOTE: the parameter name shadows the `filter` builtin;
            kept for interface compatibility with the rest of the PR.
        match_type: Optional match type, appended to the query URL
            as `&matchType=...` by the index search helper.

    """
    self.results = search_multiple_indexes(url, self.indexes, threads,
                                           filter, match_type)

def download(self, threads: int = None) -> None:
"""Download.
Expand Down
8 changes: 6 additions & 2 deletions comcrawl/utils/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"CC-MAIN-{index}-index?url={url}&output=json")


def search_single_index(index: str, url: str) -> ResultList:
def search_single_index(index: str, url: str, filter: str = None, match_type: str = None) -> ResultList:
"""Searches specific Common Crawl Index for given URL pattern.

Args:
Expand All @@ -28,6 +28,10 @@ def search_single_index(index: str, url: str) -> ResultList:
results: ResultList = []

url = URL_TEMPLATE.format(index=index, url=url)
if filter:
url += f"&filter={filter}"
if match_type:
url += f"&matchType={match_type}"
response = requests.get(url)

if response.status_code == 200:
Expand All @@ -40,7 +44,7 @@ def search_single_index(index: str, url: str) -> ResultList:

def search_multiple_indexes(url: str,
indexes: IndexList,
threads: int = None) -> ResultList:
threads: int = None, filter: str = None, match_type: str = None) -> ResultList:
"""Searches multiple Common Crawl Indexes for URL pattern.

Args:
Expand Down