From f68c665d3507b28c6041fbfc07cc0ca0f2040c8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bompard?= Date: Mon, 19 Aug 2024 12:18:58 +0200 Subject: [PATCH] Factor out the address resolving and location functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also, follow the [line of sight](https://medium.com/@matryer/line-of-sight-in-code-186dd7cdea88) code style. Signed-off-by: Aurélien Bompard --- mirrormanager2/crawler/continents.py | 54 ++---------- mirrormanager2/lib/geo.py | 94 +++++++++++++++++++++ mirrormanager2/utility/generate_worldmap.py | 72 +++------------- 3 files changed, 112 insertions(+), 108 deletions(-) create mode 100644 mirrormanager2/lib/geo.py diff --git a/mirrormanager2/crawler/continents.py b/mirrormanager2/crawler/continents.py index 1b93d4a2..63fca49c 100755 --- a/mirrormanager2/crawler/continents.py +++ b/mirrormanager2/crawler/continents.py @@ -1,16 +1,14 @@ -import collections import csv import functools import logging import os -import socket from functools import cache from importlib import resources from urllib.parse import urlparse import geoip2 -import mirrormanager2.lib +from mirrormanager2.lib import geo, get_country_continent_redirect from .constants import CONTINENTS @@ -50,7 +48,7 @@ def get_country_continents(session): with country_continent_csv.open("r") as infile: reader = csv.reader(infile) new_country_continents = {rows[0]: rows[1] for rows in reader} - for c in mirrormanager2.lib.get_country_continent_redirect(session): + for c in get_country_continent_redirect(session): new_country_continents[c.country] = c.continent return new_country_continents @@ -81,57 +79,15 @@ def check_continent(config, options, session, categoryUrl): hostname = hostname.split(":")[0] try: - addrinfo = socket.getaddrinfo(hostname, None) - except socket.gaierror as e: + addresses = geo.get_host_addresses(hostname) + except geo.HostUnreachable as e: # Name resolution failed. 
This means # that the base URL is broken. raise BrokenBaseUrl() from e - # Extract the IPv4 and IPv6 address from the tuples returned by getaddrinfo. - addresses = set() - for family, _socktype, _proto, _canonname, sockaddr in addrinfo: - # The GeoIP2 databases contain only information for IPv4 and IPv6 - # addresses. Therefore, other, unusual address families are ignored. - if family == socket.AF_INET: - address, port = sockaddr - addresses.add(address) - elif family == socket.AF_INET6: - address, port, flowinfo, scope_id = sockaddr - addresses.add(address) - # Retrieve the ISO 3166-1 code for each address. - countries = [] - for address in addresses: - try: - country = gi.country(address) - except geoip2.errors.AddressNotFoundError: - # If no country object is found for an IPv4 or IPv6 address, - # the address is ignored. - pass - else: - iso_code = country.country.iso_code - # If the ISO 3166-1 code is not available, the country cannot be - # matched to continent. Therefore, the country object is ignored. - if iso_code is not None: - countries.append(iso_code) - # The GeoIP2 databases are not perfect and fully accurate. Therefore, - # multiple countries might be returned for hosts with multiple addresses. It - # seems best to use the most frequently occuring country if a host has - # multiple addresses. - country_counter = collections.Counter(countries) - if country_counter: - # most_common(1) returns a list with one element that is tuple that - # consists of the item and its count. - country = country_counter.most_common(1)[0][0] - else: - # For hosts with no country in the GeoIP database - # the default is 'US' as that is where most of - # Fedora infrastructure systems are running - country = "US" + country = geo.get_country(addresses, geoip_db=gi) if country in config["EMBARGOED_COUNTRIES"]: raise EmbargoedCountry(country) if country_continents[country] in continents: return - # And another return value. '8' is used for mirrors on - # the wrong continent. 
The crawl should not be listed in - the database at all. raise WrongContinent diff --git a/mirrormanager2/lib/geo.py b/mirrormanager2/lib/geo.py new file mode 100644 index 00000000..bc8383ed --- /dev/null +++ b/mirrormanager2/lib/geo.py @@ -0,0 +1,94 @@ +import collections +import socket + +import geoip2.errors + + +class HostUnreachable(Exception): + pass + + +def get_host_addresses(hostname): + """Get the IP addresses for a hostname""" + try: + addrinfo = socket.getaddrinfo(hostname, None) + except socket.gaierror as e: + raise HostUnreachable(hostname) from e + + # Extract the IPv4 and IPv6 address from the tuples returned by getaddrinfo. + addresses = set() + for family, _socktype, _proto, _canonname, sockaddr in addrinfo: + # The GeoIP2 databases contain only information for IPv4 and IPv6 + # addresses. Therefore, other, unusual address families are ignored. + if family not in (socket.AF_INET, socket.AF_INET6): + continue + addresses.add(sockaddr[0]) + return addresses + + +def get_country(addresses, geoip_db): + """Return the most common ISO 3166-1 code for the addresses.""" + countries = [] + for address in addresses: + try: + country = geoip_db.country(address) + except geoip2.errors.AddressNotFoundError: + # If no country object is found for an IPv4 or IPv6 address, + # the address is ignored. + continue + iso_code = country.country.iso_code + if iso_code is None: + # If the ISO 3166-1 code is not available, the country cannot be + # matched to continent. Therefore, the country object is ignored. + continue + countries.append(iso_code) + # The GeoIP2 databases are not perfect and fully accurate. Therefore, + # multiple countries might be returned for hosts with multiple addresses. It + # seems best to use the most frequently occurring country if a host has + # multiple addresses. + country_counter = collections.Counter(countries) + if country_counter: + # most_common(1) returns a list with one element that is a tuple that + # consists of the item and its count.
+ country = country_counter.most_common(1)[0][0] + else: + # For hosts with no country in the GeoIP database + # the default is 'US' as that is where most of + # Fedora infrastructure systems are running + country = "US" + return country + + +def get_cities(addresses, geoip_db): + """Return the city object of the most common city name, or [] if none.""" + cities = [] + for address in addresses: + try: + city = geoip_db.city(address) + except geoip2.errors.AddressNotFoundError: + # If no city object was found for an IPv4 or IPv6 + # address, the address is ignored. + continue + # It seems that an empty city record is returned when no + # city was found. If no city has been found for an IPv4 + # or IPv6 address, the address is ignored. + if city.city.name is None: + continue + cities.append(city) + # If no city objects were found, the location of a host cannot + # be determined. + if not cities: + return [] + city_names = [city.city.name for city in cities] + # Only the GeoIP2 Enterprise database has a confidence score for + # each city record. Therefore, it seems best to use the most + # frequently occurring city if a host has multiple addresses. + city_name_counter = collections.Counter(city_names) + # most_common(1) returns a list with one element that is a tuple + # that consists of the item and its count. + most_common_city_name = city_name_counter.most_common(1)[0][0] + # Find a city object for the most common city name. Any city + # object should be equivalent for a given city name.
+ for city in cities: + if most_common_city_name == city.city.name: + return city diff --git a/mirrormanager2/utility/generate_worldmap.py b/mirrormanager2/utility/generate_worldmap.py index 67962030..243a6a7c 100755 --- a/mirrormanager2/utility/generate_worldmap.py +++ b/mirrormanager2/utility/generate_worldmap.py @@ -7,15 +7,13 @@ # while the rest of MirrorManager is licensed MIT/X11 -import collections import os -import socket from urllib.parse import urlsplit import click import geoip2.database -import mirrormanager2.lib +from mirrormanager2.lib import geo, get_host_category_url, read_config from mirrormanager2.lib.database import get_db_manager from .common import config_option @@ -25,81 +23,37 @@ @config_option @click.option("--verbose", is_flag=True, default=False, help="show more details") def main(config, verbose): - config = mirrormanager2.lib.read_config(config) + config = read_config(config) gi = geoip2.database.Reader(os.path.join(config["GEOIP_BASE"], "GeoLite2-City.mmdb")) db_manager = get_db_manager(config) with db_manager.Session() as session: embargoed_countries = set(x.upper() for x in config["EMBARGOED_COUNTRIES"]) tracking = set() - for hcurl in mirrormanager2.lib.get_host_category_url(session): + for hcurl in get_host_category_url(session): host = hcurl.host_category.host if host.private or host.site.private: continue hostname = urlsplit(hcurl.url)[1] if host.id in tracking: continue - gir = None try: - addrinfo = socket.getaddrinfo(hostname, None) - # Extract the IPv4 and IPv6 address from the tuples returned by - # getaddrinfo. - addresses = set() - for family, _socktype, _proto, _canonname, sockaddr in addrinfo: - # The GeoIP2 databases contain only information for IPv4 and - # IPv6 addresses. Therefore, other, unusual address families - # are ignored. 
- if family == socket.AF_INET: - address, port = sockaddr - addresses.add(address) - elif family == socket.AF_INET6: - address, port, flowinfo, scope_id = sockaddr - addresses.add(address) - # Retrieve the city object for each address. - cities = [] - for address in addresses: - try: - city = gi.city(address) - except geoip2.errors.AddressNotFoundError: - # If no city object was found for an IPv4 or IPv6 - # address, the address is ignored. - pass - else: - # It seems that an empty city record is returned when no - # city was found. If no city has been found for an IPv4 - # or IPv6 address, the address is ignored. - if city.city.name is not None: - cities.append(city) - # If no city objects were found, the location of a host cannot - # be determined. - if not cities: - continue - city_names = (city.city.name for city in cities) - # Only the GeoIP2 Enterprise database has a confidence score for - # each city record. Therefore, it seems best to use the most - # frequently occuring city if a host has multiple addresses. - city_name_counter = collections.Counter(city_names) - # most_common(1) returns a list with one element that is tuple - # that consists of the item and its count. - most_common_city_name = city_name_counter.most_common(1)[0][0] - # Find a city object for the most common city name. Any city - # object should equivalent for a given city name. - for city in cities: - if most_common_city_name == city.city.name: - gir = city - break - except Exception: + addresses = geo.get_host_addresses(hostname) + except geo.HostUnreachable: + click.echo(f"Unreachable host: {hostname}. 
Skipping.", err=True) continue - if gir is None: + + city = geo.get_cities(addresses, geoip_db=gi) + if not city: continue - if gir.country.iso_code in embargoed_countries: + if city.country.iso_code in embargoed_countries: click.echo( f"WARNING: host {host.id} ({hostname}) seems to be from an embargoed " - f"country: {gir.country.iso_code}", + f"country: {city.country.iso_code}", err=True, ) continue - host.latitude = gir.location.latitude - host.longitude = gir.location.longitude + host.latitude = city.location.latitude + host.longitude = city.location.longitude tracking.add(host.id) if verbose: click.echo(f"{host.name} ({host.id}): {host.latitude} {host.longitude}")