diff --git a/README.md b/README.md index ccb5117..c3e30f1 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ data_utils.set_data_dir_env_var(output_dir) ```python from libpypostal import parser -parser.parse_address("123 Main St, Somewhere, DC 00000") +parser.parse_address("123 Main St, Somewhere, DC 00000", merge_multiple_matches=True) ``` ## Versioning diff --git a/libpypostal/parser.py b/libpypostal/parser.py index 82fd9e6..3a4d3bf 100644 --- a/libpypostal/parser.py +++ b/libpypostal/parser.py @@ -1,9 +1,9 @@ """Python bindings to libpostal parse_address.""" from enum import Enum -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Literal, Optional, Tuple, Union, overload -class LibpostalAddressComponent(str, Enum): +class AddressComponent(str, Enum): """Libpostal address component.""" CATEGORY = "category" @@ -38,38 +38,66 @@ def _parse_address( ) +@overload def parse_address( - address: str, language: Optional[str] = None, country_code: Optional[str] = None -) -> Dict[str, List[str]]: + address: str, + merge_multiple_matches: Literal[True], + language: Optional[str] = None, + country_code: Optional[str] = None, +) -> Dict[AddressComponent, str]: + ... + + +@overload +def parse_address( + address: str, + merge_multiple_matches: Literal[False], + language: Optional[str] = None, + country_code: Optional[str] = None, +) -> Dict[AddressComponent, List[str]]: + ... + + +def parse_address( + address: str, + merge_multiple_matches: bool, + language: Optional[str] = None, + country_code: Optional[str] = None, +) -> Union[Dict[AddressComponent, str], Dict[AddressComponent, List[str]]]: """Parses address into components. Arguments: address: the address to parse. + merge_multiple_matches: indicates whether to merge multiple matches for a component. language: optional language code to help localize parsing. country_code: optional country code to help localize parsing. Returns: - Dictionary of address components with format {
: parsed value}. - Generally, address component lists will only have one element, but there is a - possibility of multiple matches. Address components not found in the input are - set to empty lists. + If merge_multiple_matches is set to False, the return type is Dict[str, List[str]]. If + set to True, a simple concatenation of matches is done with a single space and the return + type is Dict[str, str]. Given multiple matches are rare, many use cases may prefer the + simple concatenation, but there is still the ability to use custom logic to handle multiple + matches if needed. """ address_component_tuples = _parse_address( address, language=language, country_code=country_code ) - parsed_address_components: Dict[str, List[str]] = {} - - for address_component_tuple in address_component_tuples: - component_value, component_name = address_component_tuple - + parsed_address_components: Dict[AddressComponent, List[str]] = {} + for component_value, component_name in address_component_tuples: if component_name in parsed_address_components: - parsed_address_components[component_name].append(component_value) + parsed_address_components[AddressComponent(component_name)].append( + component_value + ) else: - parsed_address_components[component_name] = [component_value] + parsed_address_components[AddressComponent(component_name)] = [ + component_value + ] - for libpostal_address_component in LibpostalAddressComponent: - if libpostal_address_component.value not in parsed_address_components: - parsed_address_components[libpostal_address_component.value] = [] + if merge_multiple_matches: + return { + component_name: " ".join(component_value) + for component_name, component_value in parsed_address_components.items() + } return parsed_address_components diff --git a/tests/test_parser.py b/tests/test_parser.py index 250cdcc..1529a6b 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -24,23 +24,39 @@ def test_parse_address_single_match_each_component( "state": ["ca"], "postcode": ["90210"], "country": ["us"], - "category": [], - "city_district": [], - "country_region": [], - "entrance": [], - "house": [], - "island": [], - "level": [], - "near": [], - "po_box": [], - "staircase": [], - "state_district": [], - "suburb": [], - "unit": [], - "world_region": [], } - actual = parser.parse_address(test_address) + actual = parser.parse_address(test_address, merge_multiple_matches=False) + + assert actual == expected + mock__parser_parse_address.assert_called_once_with( + test_address, language=None, country_code=None + ) + + +@patch("libpypostal.parser._parse_address") +def test_parse_address_single_match_each_component_merged_matches( + mock__parser_parse_address: MagicMock, +) -> None: + test_address = "123 Bayes Way, Beverly Hills, CA 90210 US" + mock__parser_parse_address.return_value = [ + ("123", "house_number"), + ("bayes way", "road"), + ("beverly hills", "city"), + ("ca", "state"), + ("90210", "postcode"), + ("us", "country"), + ] + expected = { + "house_number": "123", + "road": "bayes way", + "city": "beverly hills", + "state": "ca", + "postcode": "90210", + "country": "us", + } + + actual = parser.parse_address(test_address, merge_multiple_matches=True) assert actual == expected mock__parser_parse_address.assert_called_once_with( @@ -69,23 +85,40 @@ def test_parse_address_multiple_matches_for_component( "state": ["ca", "california"], "postcode": ["90210"], "country": ["us"], - "category": [], - "city_district": [], - "country_region": [], - "entrance": [], - "house": [], - "island": [], - "level": [], - "near": [], - "po_box": [], - "staircase": [], - "state_district": [], - "suburb": [], - "unit": [], - "world_region": [], } - actual = parser.parse_address(test_address) + actual = parser.parse_address(test_address, merge_multiple_matches=False) + + assert actual == expected + mock__parser_parse_address.assert_called_once_with( + test_address, language=None, country_code=None + ) + + +@patch("libpypostal.parser._parse_address") +def test_parse_address_multiple_matches_for_component_merge_matches( + mock__parser_parse_address: MagicMock, +) -> None: + test_address = "123 Bayes Way, Beverly Hills, CA 90210 California US" + mock__parser_parse_address.return_value = [ + ("123", "house_number"), + ("bayes way", "road"), + ("beverly hills", "city"), + ("ca", "state"), + ("california", "state"), + ("90210", "postcode"), + ("us", "country"), + ] + expected = { + "house_number": "123", + "road": "bayes way", + "city": "beverly hills", + "state": "ca california", + "postcode": "90210", + "country": "us", + } + + actual = parser.parse_address(test_address, merge_multiple_matches=True) assert actual == expected mock__parser_parse_address.assert_called_once_with(