ANJ changes to SQL, capturing in repo

ukwa · Feb 2, 2024 · bfb3861
1 parent 6a0b93c
commit bfb3861
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
 __pycache__
 build
 *.egg-info
+.venv/*
+*.csv
+nohup.out
diff --git a/duckdb-query.py b/duckdb-query.py
@@ -18,19 +18,21 @@ def run_print_and_save(query, csv_file):
 
 # Scan for activity from a particular url_domain
 #run_print_and_save(f"SELECT * FROM '{in_file}' WHERE url_domain == 'bbc.co.uk' LIMIT 10", "some_rows.csv")
-run_print_and_save(f"SELECT * FROM '{in_file}' WHERE status_code == 299 LIMIT 10", "some_rows.csv")
+run_print_and_save(f"SELECT * FROM '{in_file}' WHERE url LIKE '%service.gov.uk%' LIMIT 10", "some_rows.csv")
+#run_print_and_save(f"SELECT * FROM '{in_file}' WHERE status_code == -5002 LIMIT 10", "some_rows.csv")
 
 #run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", "some_rows.csv")
 #print(duckdb.query(f"SELECT domain, status_code, COUNT(*) from '{in_file}' GROUP BY domain, status_code ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' WHERE domain == 'bbc.co.uk' GROUP BY domain, status_code, content_type ORDER BY COUNT(*) DESC"))
 
-#run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", "some_rows.csv")
-#run_print_and_save(f"SELECT url, tries, start_time, duration, launch_time, (start_time - launch_time) AS delay, webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 ORDER BY delay DESC LIMIT 100", "some_rows.csv")
-
 #print(duckdb.query(f"SELECT url_domain, status_code, COUNT(*) from '{in_file}' GROUP BY url_domain, status_code ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT url_domain, status_code, content_type, SUM(content_length), COUNT(*) from '{in_file}' GROUP BY url_domain, status_code, content_type ORDER BY COUNT(*) DESC"))
-print(duckdb.query(f"SELECT status_code, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from '{in_file}' GROUP BY status_code ORDER BY COUNT(*) DESC"))
+run_print_and_save(f"SELECT status_code, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from '{in_file}' GROUP BY status_code ORDER BY COUNT(*) DESC", "status_codes.csv")
+
+run_print_and_save(f"SELECT status_code, content_type, url_hostname, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from '{in_file}' WHERE url LIKE '%gov.uk%' GROUP BY status_code, content_type, url_hostname ORDER BY COUNT(*) DESC", "gov_uk_summary.csv")
+run_print_and_save(f"SELECT status_code, content_type, SUM(content_length) AS total_bytes, COUNT(*) AS total_records from '{in_file}' WHERE url LIKE '%service.gov.uk%' GROUP BY status_code, content_type ORDER BY COUNT(*) DESC", "gov_uk_assets_summary.csv")
 
+#print(duckdb.query(f"SELECT url_domain, status_code, annotations, SUM(content_length), COUNT(*) from '{in_file}' WHERE url_domain == 'bbc.co.uk' GROUP BY url_domain, status_code, annotations  ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT url_domain, status_code, annotations, SUM(content_length), COUNT(*) from '{in_file}' WHERE url_domain == 'bbc.co.uk' GROUP BY url_domain, status_code, annotations  ORDER BY COUNT(*) DESC"))
 
 #run_print_and_save(f"SELECT DATE_TRUNC('hour', STRPTIME(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')) as start_hour, COUNT(*) \
@@ -47,3 +49,6 @@ def run_print_and_save(query, csv_file):
 #print(duckdb.query(f"SELECT url_host, COUNT(*) from '{in_file}' WHERE status_code > 0 GROUP BY url_host ORDER BY COUNT(*) DESC"))
 #print(duckdb.query(f"SELECT url_domain, COUNT(DISTINCT url_host) from '{in_file}' GROUP BY url_domain ORDER BY COUNT(DISTINCT url_host) DESC"))
 
+#run_print_and_save(f"SELECT url, start_time, STRPTIME(COALESCE(NULLIF(REGEXP_EXTRACT(annotations, '.*launchTimestamp:([0-9]+).*',1),''),'20300101000000'),'%Y%m%d%H%M%S') AS launch_time, (start_time - launch_time) AS delay, REGEXP_EXTRACT(annotations, '.*WebRenderStatus:([0-9]+).*',1) AS webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 AND delay IS NOT NULL ORDER BY delay DESC LIMIT 100", "some_rows.csv")
+run_print_and_save(f"SELECT url, tries, start_time, duration, launch_time, (start_time - launch_time) AS delay, webrender_status_code, annotations FROM '{in_file}' WHERE status_code == -5002 ORDER BY delay DESC LIMIT 100", "delayed_webrenders.csv")
+