diff --git a/eva-usage-stats/ws_query_analysis.py b/eva-usage-stats/ws_query_analysis.py index c25ad237..083df337 100644 --- a/eva-usage-stats/ws_query_analysis.py +++ b/eva-usage-stats/ws_query_analysis.py @@ -31,7 +31,7 @@ def _get_location(ip_address): # } -@lru_cache +@lru_cache(maxsize=None) def get_location(ip_address): try: return _get_location(ip_address) @@ -104,10 +104,13 @@ def main(): 'client_country_code', 'client_country_name', 'client_city', 'client_postal', 'client_latitude', 'client_longitude', 'client_state' ) + formatted_column_names = ', '.join((f'"{c}"' for c in column_name_tuple)) query = ( - f"insert into eva_web_srvc_stats.ws_traffic values {column_name_tuple} (" - f"%s, %s, %s, %s, %s, %s, %s, %s, %s, cast(%s as timestamp with time zone), %s, %s, %s, %s, %s, %s, " - f"%s, %s, %s, %s, %s, %s, %s, %s, %s);") + f"insert into eva_web_srvc_stats.ws_traffic ({formatted_column_names}) values (" + f"%s, %s, %s, %s, %s, %s, %s, %s, %s, cast(%s as timestamp with time zone), " + f"%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " + f"%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " + f"%s, %s);") result_cursor.execute(query, ( data["@timestamp"], data["@timestamp"], data["type"], data["host"], @@ -132,6 +135,7 @@ def main(): data["request_query"] if "request_query" in data else '', data["cookie_header"] if "cookie_header" in data else '', tot_segment_length, + location_dict.get('country_code'), location_dict.get('country_name'), location_dict.get('city'), location_dict.get('postal'), diff --git a/eva-usage-stats/ws_query_fill_in_location.py b/eva-usage-stats/ws_query_fill_in_location.py new file mode 100644 index 00000000..6b8a2ca3 --- /dev/null +++ b/eva-usage-stats/ws_query_fill_in_location.py @@ -0,0 +1,85 @@ +#!/usr/bin/python +import datetime +from argparse import ArgumentParser +from functools import lru_cache + +import requests +from ebi_eva_common_pyutils.logger import logging_config + +from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query +from requests import HTTPError +from retry import retry + +logger = logging_config.get_logger(__name__) + +@retry(tries=5, delay=8, backoff=1.2, jitter=(1, 3)) +def _get_location(ip_address): + response = requests.get('https://geolocation-db.com/json/' + ip_address) + response.raise_for_status() + return response.json() + # { + # "country_code":"NL", + # "country_name":"Netherlands", + # "city":"Amsterdam", + # "postal":"1105", + # "latitude":52.2965, + # "longitude":4.9542, + # "IPv4":"82.196.6.158", + # "state":"North Holland" + # } + + +@lru_cache(maxsize=None) +def get_location(ip_address): + try: + return _get_location(ip_address) + except HTTPError: + return {} + + +def main(): + parser = ArgumentParser(description='') + parser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml", required=True) + args = parser.parse_args() + logging_config.add_stdout_handler() + logger.info("Job ran at " + str(datetime.datetime.now())) + + postgres_conn_handle = get_metadata_connection_handle("production_processing", args.private_config_xml_file) + chunk_size = 100 + update_query = ( + 'UPDATE eva_web_srvc_stats.ws_traffic SET client_country_code=%s, client_country_name=%s, ' + 'client_city=%s, client_postal=%s, client_latitude=%s, client_longitude=%s, client_state=%s ' + 'WHERE client_ip=%s AND client_country_code is null;' + ) + nb_ip = 0 + nb_row_updated = 0 + while True: + query = (f"SELECT distinct client_ip FROM eva_web_srvc_stats.ws_traffic " + f"where client_country_code is null limit {chunk_size};") + ip_addresses = list(get_all_results_for_query(postgres_conn_handle, query)) + + if len(ip_addresses) == 0: + break + for ip_address, in ip_addresses: + location_dict = get_location(ip_address) + with postgres_conn_handle.cursor() as update_cursor: + # execute the UPDATE statement + update_cursor.execute( + update_query, + (location_dict.get('country_code'), location_dict.get('country_name'), location_dict.get('city'), + location_dict.get('postal'), location_dict.get('latitude'), location_dict.get('longitude'), + location_dict.get('state'), ip_address) + ) + updated_row_count = update_cursor.rowcount + logger.info(f'Updated {updated_row_count} record for {ip_address}') + nb_ip += 1 + nb_row_updated += updated_row_count + # commit the changes to the database + logger.info(f'Committed {nb_row_updated} record for {nb_ip} IP addresses') + postgres_conn_handle.commit() + logger.info(f'Updated {nb_row_updated} record for {nb_ip} IP addresses') + + +if __name__ == '__main__': + main()