Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use socket.getaddrinfo instead of socket.gethostbyname #305

Merged
merged 3 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 5 additions & 14 deletions mirrormanager2/crawler/continents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
import functools
import logging
import os
import socket
from functools import cache
from importlib import resources
from urllib.parse import urlparse

import geoip2

import mirrormanager2.lib
from mirrormanager2.lib import geo, get_country_continent_redirect

from .constants import CONTINENTS

Expand Down Expand Up @@ -49,7 +48,7 @@ def get_country_continents(session):
with country_continent_csv.open("r") as infile:
reader = csv.reader(infile)
new_country_continents = {rows[0]: rows[1] for rows in reader}
for c in mirrormanager2.lib.get_country_continent_redirect(session):
for c in get_country_continent_redirect(session):
new_country_continents[c.country] = c.continent
return new_country_continents

Expand Down Expand Up @@ -80,23 +79,15 @@ def check_continent(config, options, session, categoryUrl):
hostname = hostname.split(":")[0]

try:
hostname = socket.gethostbyname(hostname)
except Exception as e:
addresses = geo.get_host_addresses(hostname)
except geo.HostUnreachable as e:
# Name resolution failed. This means
# that the base URL is broken.
raise BrokenBaseUrl() from e

country = gi.country(hostname).country.iso_code
if not country:
# For hosts with no country in the GeoIP database
# the default is 'US' as that is where most of
# Fedora infrastructure systems are running
country = "US"
country = geo.get_country(addresses, geoip_db=gi)
if country in config["EMBARGOED_COUNTRIES"]:
raise EmbargoedCountry(country)
if country_continents[country] in continents:
return
# And another return value. '8' is used for mirrors on
# the wrong continent. The crawl should not be listed in
# the database at all.
raise WrongContinent
94 changes: 94 additions & 0 deletions mirrormanager2/lib/geo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import collections
import socket

import geoip2


class HostUnreachable(Exception):
    """Raised when a hostname cannot be resolved to any address."""


def get_host_addresses(hostname):
    """Get the IP addresses for a hostname.

    Args:
        hostname: The host name (or literal IP address) to resolve.

    Returns:
        A set with the IPv4 and IPv6 address strings the name resolves to.
        Addresses of any other address family are left out.

    Raises:
        HostUnreachable: If the name cannot be resolved, or is not a
            well-formed host name in the first place.
    """
    try:
        addrinfo = socket.getaddrinfo(hostname, None)
    except (socket.gaierror, UnicodeError) as e:
        # getaddrinfo() IDNA-encodes the name before resolving it, and the
        # codec raises UnicodeError (not gaierror) for malformed names such
        # as "a..b" or labels longer than 63 characters. Such a name is just
        # as unresolvable as an unknown one, so report it the same way.
        raise HostUnreachable(hostname) from e

    # The GeoIP2 databases contain only information for IPv4 and IPv6
    # addresses. Therefore, other, unusual address families are ignored.
    # getaddrinfo() yields one tuple per socket type, so the same address
    # shows up several times; the set removes those duplicates.
    return {
        sockaddr[0]
        for family, _socktype, _proto, _canonname, sockaddr in addrinfo
        if family in (socket.AF_INET, socket.AF_INET6)
    }


def get_country(addresses, geoip_db):
    """Return the ISO 3166-1 code of the country most of the addresses are in.

    Args:
        addresses: An iterable of IPv4/IPv6 address strings.
        geoip_db: An object with a ``country(address)`` method whose result
            exposes ``.country.iso_code`` (e.g. a GeoIP2 database reader).

    Returns:
        The most frequently occurring country code among the addresses.
        If several countries are equally frequent, the alphabetically first
        code is returned so that the result does not depend on the iteration
        order of ``addresses``. ``"US"`` is returned when no address could
        be mapped to a country.
    """
    country_counter = collections.Counter()
    for address in addresses:
        try:
            record = geoip_db.country(address)
        except geoip2.errors.AddressNotFoundError:
            # If no country object is found for an IPv4 or IPv6 address,
            # the address is ignored.
            continue
        iso_code = record.country.iso_code
        if iso_code is None:
            # If the ISO 3166-1 code is not available, the country cannot be
            # matched to a continent. Therefore, the record is ignored.
            continue
        country_counter[iso_code] += 1

    if not country_counter:
        # For hosts with no country in the GeoIP database
        # the default is 'US' as that is where most of
        # Fedora infrastructure systems are running
        return "US"

    # The GeoIP2 databases are not perfect and fully accurate. Therefore,
    # multiple countries might be returned for hosts with multiple addresses.
    # Use the most frequently occurring country; sort ties by code, because
    # the addresses usually arrive as a set whose order varies between runs.
    return min(country_counter, key=lambda code: (-country_counter[code], code))


def get_cities(addresses, geoip_db):
    """Return the GeoIP city record that best represents the addresses.

    Args:
        addresses: An iterable of IPv4/IPv6 address strings.
        geoip_db: An object with a ``city(address)`` method whose result
            exposes ``.city.name`` (e.g. a GeoIP2 city database reader).

    Returns:
        A single city record: the first one found for the most frequently
        occurring city name. If several names are equally frequent, the
        alphabetically first name wins so that the result does not depend
        on the iteration order of ``addresses``. ``None`` is returned when
        no address could be mapped to a named city.
    """
    records_by_name = {}
    name_counter = collections.Counter()
    for address in addresses:
        try:
            record = geoip_db.city(address)
        except geoip2.errors.AddressNotFoundError:
            # If no city object was found for an IPv4 or IPv6
            # address, the address is ignored.
            continue
        name = record.city.name
        # It seems that an empty city record is returned when no
        # city was found. If no city has been found for an IPv4
        # or IPv6 address, the address is ignored.
        if name is None:
            continue
        name_counter[name] += 1
        # Any city object should be equivalent for a given city name, so
        # only the first record seen for each name is kept.
        records_by_name.setdefault(name, record)

    # If no city objects were found, the location of a host cannot
    # be determined. Return None (not an empty list) so that callers can
    # test the result with ``is None`` before reading its attributes.
    if not name_counter:
        return None

    # Only the GeoIP2 Enterprise database has a confidence score for
    # each city record. Therefore, it seems best to use the most
    # frequently occurring city if a host has multiple addresses.
    most_common_city_name = min(
        name_counter, key=lambda city_name: (-name_counter[city_name], city_name)
    )
    return records_by_name[most_common_city_name]


# The function returns a single record and is called as ``get_city`` by the
# world map generator; keep both names available.
get_city = get_cities
25 changes: 13 additions & 12 deletions mirrormanager2/utility/generate_worldmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@


import os
import socket
from urllib.parse import urlsplit

import click
import geoip2.database

import mirrormanager2.lib
from mirrormanager2.lib import geo, get_host_category_url, read_config
from mirrormanager2.lib.database import get_db_manager

from .common import config_option
Expand All @@ -24,35 +23,37 @@
@config_option
@click.option("--verbose", is_flag=True, default=False, help="show more details")
def main(config, verbose):
config = mirrormanager2.lib.read_config(config)
config = read_config(config)
gi = geoip2.database.Reader(os.path.join(config["GEOIP_BASE"], "GeoLite2-City.mmdb"))
db_manager = get_db_manager(config)
with db_manager.Session() as session:
embargoed_countries = set(x.upper() for x in config["EMBARGOED_COUNTRIES"])
tracking = set()
for hcurl in mirrormanager2.lib.get_host_category_url(session):
for hcurl in get_host_category_url(session):
host = hcurl.host_category.host
if host.private or host.site.private:
continue
hostname = urlsplit(hcurl.url)[1]
if host.id in tracking:
continue
try:
ip = socket.gethostbyname(hostname)
gir = gi.city(ip)
except Exception:
addresses = geo.get_host_addresses(hostname)
except geo.HostUnreachable:
click.echo(f"Unreachable host: {hostname}. Skipping.", err=True)
continue
if gir is None:

city = geo.get_city(addresses, geoip_db=gi)
if city is None:
continue
if gir.country.iso_code in embargoed_countries:
if city.country.iso_code in embargoed_countries:
click.echo(
f"WARNING: host {host.id} ({hostname}) seems to be from an embargoed "
f"country: {gir.country.iso_code}",
f"country: {city.country.iso_code}",
err=True,
)
continue
host.latitude = gir.location.latitude
host.longitude = gir.location.longitude
host.latitude = city.location.latitude
host.longitude = city.location.longitude
tracking.add(host.id)
if verbose:
click.echo(f"{host.name} ({host.id}): {host.latitude} {host.longitude}")
Expand Down