Skip to content

Commit

Permalink
Merge pull request #172 from tcezard/get_location_from_ip
Browse files Browse the repository at this point in the history
Script to update the location of IP addresses that are missing it.
  • Loading branch information
tcezard authored Jun 20, 2024
2 parents 4d4e2f9 + 517b644 commit 031ef6f
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 4 deletions.
12 changes: 8 additions & 4 deletions eva-usage-stats/ws_query_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def _get_location(ip_address):
# }


@lru_cache
@lru_cache(maxsize=None)
def get_location(ip_address):
try:
return _get_location(ip_address)
Expand Down Expand Up @@ -104,10 +104,13 @@ def main():
'client_country_code', 'client_country_name', 'client_city', 'client_postal', 'client_latitude',
'client_longitude', 'client_state'
)
formatted_column_names = ', '.join((f'"{c}"' for c in column_name_tuple))
query = (
f"insert into eva_web_srvc_stats.ws_traffic values {column_name_tuple} ("
f"%s, %s, %s, %s, %s, %s, %s, %s, %s, cast(%s as timestamp with time zone), %s, %s, %s, %s, %s, %s, "
f"%s, %s, %s, %s, %s, %s, %s, %s, %s);")
f"insert into eva_web_srvc_stats.ws_traffic ({formatted_column_names}) values ("
f"%s, %s, %s, %s, %s, %s, %s, %s, %s, cast(%s as timestamp with time zone), "
f"%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
f"%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
f"%s, %s);")
result_cursor.execute(query, (
data["@timestamp"], data["@timestamp"], data["type"],
data["host"],
Expand All @@ -132,6 +135,7 @@ def main():
data["request_query"] if "request_query" in data else '',
data["cookie_header"] if "cookie_header" in data else '',
tot_segment_length,
location_dict.get('country_code'),
location_dict.get('country_name'),
location_dict.get('city'),
location_dict.get('postal'),
Expand Down
85 changes: 85 additions & 0 deletions eva-usage-stats/ws_query_fill_in_location.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/python
import datetime
from argparse import ArgumentParser
from functools import lru_cache

import requests
from ebi_eva_common_pyutils.logger import logging_config

from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query
from requests import HTTPError
from retry import retry

# Module-level logger, named after this module and obtained through the shared logging_config helper.
logger = logging_config.get_logger(__name__)

@retry(tries=5, delay=8, backoff=1.2, jitter=(1, 3))
def _get_location(ip_address, timeout=30):
    """Fetch the location record of an IP address from geolocation-db.com.

    The call is wrapped by the ``retry`` decorator (5 tries, initial delay of 8
    with a 1.2 backoff factor and a jitter between 1 and 3), so any exception
    raised here triggers a new attempt until the tries are exhausted.

    :param ip_address: IP address to look up, as a string appended to the service URL.
    :param timeout: seconds to wait for the service before giving up on one attempt.
        Without a timeout a stalled connection would block the job forever and the
        retry policy would never get a chance to run.
    :return: the decoded JSON payload, e.g.::

        {
            "country_code": "NL",
            "country_name": "Netherlands",
            "city": "Amsterdam",
            "postal": "1105",
            "latitude": 52.2965,
            "longitude": 4.9542,
            "IPv4": "82.196.6.158",
            "state": "North Holland"
        }

    :raises requests.HTTPError: when the service answers with an error status.
    """
    response = requests.get('https://geolocation-db.com/json/' + ip_address, timeout=timeout)
    response.raise_for_status()
    return response.json()


@lru_cache(maxsize=None)
def get_location(ip_address):
    """Return the location record of an IP address, or an empty dict on HTTP failure.

    Results (including the empty dict returned on failure) are memoised per
    IP address for the lifetime of the process by ``lru_cache``.

    :param ip_address: IP address to look up.
    :return: the dict produced by ``_get_location`` or ``{}`` if it raised ``HTTPError``.
    """
    try:
        location = _get_location(ip_address)
    except HTTPError:
        # The lookup failed with an HTTP error: report "no location".
        location = {}
    return location


def main():
    """Fill in the location columns of ws_traffic rows that have no country code.

    Distinct client IPs whose ``client_country_code`` is null are read in chunks,
    looked up with ``get_location`` and written back with one UPDATE per IP.
    Changes are committed after each chunk.

    IPs for which no country code can be obtained are left untouched and
    remembered, so they are not looked up again during this run. Their rows keep
    a null country code and would otherwise be selected again and again, making
    the loop spin forever.
    """
    parser = ArgumentParser(description='')
    parser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml", required=True)
    args = parser.parse_args()
    logging_config.add_stdout_handler()
    logger.info("Job ran at " + str(datetime.datetime.now()))

    postgres_conn_handle = get_metadata_connection_handle("production_processing", args.private_config_xml_file)
    chunk_size = 100
    update_query = (
        'UPDATE eva_web_srvc_stats.ws_traffic SET client_country_code=%s, client_country_name=%s, '
        'client_city=%s, client_postal=%s, client_latitude=%s, client_longitude=%s, client_state=%s '
        'WHERE client_ip=%s AND client_country_code is null;'
    )
    nb_ip = 0
    nb_row_updated = 0
    # IPs whose rows still have a null country code after being processed in this run.
    unresolved_ips = set()
    try:
        while True:
            # Unresolved IPs still match the WHERE clause, so widen the limit by their
            # number: if any other IP is left, at least one of them is returned.
            limit = chunk_size + len(unresolved_ips)
            query = (f"SELECT distinct client_ip FROM eva_web_srvc_stats.ws_traffic "
                     f"where client_country_code is null limit {limit};")
            pending_ips = [
                ip_address
                for ip_address, in get_all_results_for_query(postgres_conn_handle, query)
                if ip_address not in unresolved_ips
            ]
            if not pending_ips:
                break

            for ip_address in pending_ips:
                # A null client_ip cannot be looked up (nor matched by the UPDATE).
                location_dict = get_location(ip_address) if ip_address else {}
                if location_dict.get('country_code') is None:
                    # Updating would write a null country code and change nothing
                    # as far as the selection above is concerned.
                    unresolved_ips.add(ip_address)
                    logger.warning(f'No location found for {ip_address}, leaving its records unchanged')
                    continue
                with postgres_conn_handle.cursor() as update_cursor:
                    # execute the UPDATE statement
                    update_cursor.execute(
                        update_query,
                        (location_dict.get('country_code'), location_dict.get('country_name'),
                         location_dict.get('city'), location_dict.get('postal'),
                         location_dict.get('latitude'), location_dict.get('longitude'),
                         location_dict.get('state'), ip_address)
                    )
                    updated_row_count = update_cursor.rowcount
                if updated_row_count == 0:
                    # Nothing matched: make sure this IP cannot be selected for work again.
                    unresolved_ips.add(ip_address)
                logger.info(f'Updated {updated_row_count} record for {ip_address}')
                nb_ip += 1
                nb_row_updated += updated_row_count

            # commit the changes to the database, then report it
            postgres_conn_handle.commit()
            logger.info(f'Committed {nb_row_updated} record for {nb_ip} IP addresses')
    finally:
        postgres_conn_handle.close()
    if unresolved_ips:
        logger.warning(f'{len(unresolved_ips)} IP addresses could not be resolved and were skipped')
    logger.info(f'Updated {nb_row_updated} record for {nb_ip} IP addresses')


# Run the job only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

0 comments on commit 031ef6f

Please sign in to comment.