-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgrepapp_scraper.py
84 lines (67 loc) · 3.02 KB
/
grepapp_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import click
import requests
import csv
# grep.app search API endpoint. Placeholders: search query, pre-built
# language-filter string ("f.lang=..." pairs), and 1-based page number.
# NOTE(review): the query is not URL-encoded before substitution — terms
# with spaces or '&' will corrupt the query string; confirm callers.
SEARCH_URL = 'https://grep.app/api/search?q={}&{}&page={}'
# Browser-like User-Agent so the request looks like a normal web client.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
def fetch_results(term, languages, exclude, max_pages, show_snippet):
    """Query the grep.app search API and collect matching results.

    Args:
        term: Search string; URL-encoded before being placed in the URL.
        languages: Comma-separated language names, each sent as an
            ``f.lang`` filter (e.g. ``"Python,Java"``).
        exclude: Comma-separated keywords; results whose repository name
            contains any keyword are dropped. Empty string disables this.
        max_pages: Maximum number of pages to fetch; a falsy value means
            fetch until the API returns an empty page.
        show_snippet: When True, keep each hit's code snippet; otherwise
            the "snippet" field is None.

    Returns:
        A list of dicts with keys "repo", "path" and "snippet".
    """
    from urllib.parse import quote_plus  # encode user input safely into the URL

    results = []
    # Each requested language becomes its own f.lang query parameter.
    lang_filters = "&".join(
        f"f.lang={quote_plus(lang)}" for lang in languages.split(',')
    )
    page = 1
    while not max_pages or page <= max_pages:
        page_url = SEARCH_URL.format(quote_plus(term), lang_filters, page)
        response = requests.get(page_url, headers=HEADERS)
        try:
            result_page_json = response.json()
        # ValueError is the base of requests' JSONDecodeError, so this works
        # on requests versions that predate requests.exceptions.JSONDecodeError.
        except ValueError:
            click.echo("Error: Invalid JSON response")
            break
        hits = result_page_json.get('hits', {}).get('hits', [])
        if not hits:
            break  # Stop if no new results
        for hit in hits:
            results.append({
                "repo": hit['repo']['raw'],
                "path": hit['path']['raw'],
                "snippet": hit['content']['snippet'] if show_snippet else None,
            })
        click.echo(f'Got {len(hits)} results from page {page}')
        page += 1
    # Apply exclude filter if provided; ignore empty words from stray commas,
    # since "'' in repo" is always True and would discard every result.
    if exclude:
        exclude_words = {word for word in exclude.split(',') if word}
        results = [
            r for r in results
            if not any(word in r['repo'] for word in exclude_words)
        ]
    return results
def save_to_csv(results, filename="results.csv"):
    """Save the search results to a CSV file.

    Args:
        results: List of dicts with keys "repo", "path" and "snippet".
        filename: Destination CSV path (overwritten if it exists).
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["repo", "path", "snippet"])
        writer.writeheader()
        writer.writerows(results)
    # Bug fix: the message previously printed a literal placeholder instead
    # of the actual destination filename.
    click.echo(f"Results saved to {filename}")
@click.command()
@click.option('--search-term', required=True, type=str, help='Search term to match')
@click.option('--lang', required=True, type=str, help='Filter results by language (comma-separated, e.g., Python,Java)')
@click.option('--exclude', default='', type=str, help='Exclude results by keywords (comma-separated)')
# Map --max to the parameter name max_pages so we don't shadow the builtin.
@click.option('--max', 'max_pages', default=4, type=int, help='Maximum pages to fetch results from')
@click.option('--show-snippet', is_flag=True, help='Show code snippets in output')
# Map --csv to to_csv so we don't shadow the imported csv module.
@click.option('--csv', 'to_csv', is_flag=True, help='Save results to CSV file')
def scrape(search_term, lang, exclude, max_pages, show_snippet, to_csv):
    """Search grep.app, print matching repos/paths, and optionally save a CSV."""
    results = fetch_results(search_term, lang, exclude, max_pages, show_snippet)
    click.echo(f'Found {len(results)} results.')
    for res in results:
        click.echo(f"\nRepository: {res['repo']}")
        click.echo(f"File Path: {res['path']}")
        if show_snippet and res["snippet"]:
            click.echo("Code Snippet:")
            click.echo(res["snippet"])
        click.echo("-" * 40)
    if to_csv:
        save_to_csv(results)


if __name__ == '__main__':
    scrape()