Skip to content

Commit

Permalink
update parse_address api
Browse files Browse the repository at this point in the history
  • Loading branch information
gmweaver committed Jun 7, 2024
1 parent 9861f49 commit c24851a
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 49 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ data_utils.set_data_dir_env_var(output_dir)
```python
from libpypostal import parser

parser.parse_address("123 Main St, Somewhere, DC 00000")
parser.parse_address("123 Main St, Somewhere, DC 00000", merge_multiple_matches=True)
```

## Versioning
Expand Down
64 changes: 46 additions & 18 deletions libpypostal/parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Python bindings to libpostal parse_address."""
from enum import Enum
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Literal, Optional, Tuple, Union, overload


class LibpostalAddressComponent(str, Enum):
class AddressComponent(str, Enum):
"""Libpostal address component."""

CATEGORY = "category"
Expand Down Expand Up @@ -38,38 +38,66 @@ def _parse_address(
)


@overload
def parse_address(
address: str, language: Optional[str] = None, country_code: Optional[str] = None
) -> Dict[str, List[str]]:
address: str,
merge_multiple_matches: Literal[True],
language: Optional[str] = None,
country_code: Optional[str] = None,
) -> Dict[AddressComponent, str]:
...


@overload
def parse_address(
address: str,
merge_multiple_matches: Literal[False],
language: Optional[str] = None,
country_code: Optional[str] = None,
) -> Dict[AddressComponent, List[str]]:
...


def parse_address(
address: str,
merge_multiple_matches: bool,
language: Optional[str] = None,
country_code: Optional[str] = None,
) -> Union[Dict[AddressComponent, str], Dict[AddressComponent, List[str]]]:
"""Parses address into components.
Arguments:
address: the address to parse.
merge_multiple_matches: indicates whether to merge multiple matches for a component.
language: optional language code to help localize parsing.
country_code: optional country code to help localize parsing.
Returns:
Dictionary of address components with format {<address component>: parsed value}.
Generally, address component lists will only have one element, but there is a
possibility of multiple matches. Address components not found in the input are
set to empty lists.
If merge_multiple_matches is set to False, the return type is
Dict[AddressComponent, List[str]], with every match for a component kept in order. If
set to True, the matches for each component are joined with a single space and the return
type is Dict[AddressComponent, str]. Because multiple matches are rare, many use cases may
prefer the simple concatenation, but callers that need custom handling of multiple matches
can pass False and process the lists themselves.
"""
address_component_tuples = _parse_address(
address, language=language, country_code=country_code
)

parsed_address_components: Dict[str, List[str]] = {}

for address_component_tuple in address_component_tuples:
component_value, component_name = address_component_tuple

parsed_address_components: Dict[AddressComponent, List[str]] = {}
for component_value, component_name in address_component_tuples:
if component_name in parsed_address_components:
parsed_address_components[component_name].append(component_value)
parsed_address_components[AddressComponent(component_name)].append(
component_value
)
else:
parsed_address_components[component_name] = [component_value]
parsed_address_components[AddressComponent(component_name)] = [
component_value
]

for libpostal_address_component in LibpostalAddressComponent:
if libpostal_address_component.value not in parsed_address_components:
parsed_address_components[libpostal_address_component.value] = []
if merge_multiple_matches:
return {
component_name: " ".join(component_value)
for component_name, component_value in parsed_address_components.items()
}

return parsed_address_components
93 changes: 63 additions & 30 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,39 @@ def test_parse_address_single_match_each_component(
"state": ["ca"],
"postcode": ["90210"],
"country": ["us"],
"category": [],
"city_district": [],
"country_region": [],
"entrance": [],
"house": [],
"island": [],
"level": [],
"near": [],
"po_box": [],
"staircase": [],
"state_district": [],
"suburb": [],
"unit": [],
"world_region": [],
}

actual = parser.parse_address(test_address)
actual = parser.parse_address(test_address, merge_multiple_matches=False)

assert actual == expected
mock__parser_parse_address.assert_called_once_with(
test_address, language=None, country_code=None
)


@patch("libpypostal.parser._parse_address")
def test_parse_address_single_match_each_component_merged_matches(
mock__parser_parse_address: MagicMock,
) -> None:
test_address = "123 Bayes Way, Beverly Hills, CA 90210 US"
mock__parser_parse_address.return_value = [
("123", "house_number"),
("bayes way", "road"),
("beverly hills", "city"),
("ca", "state"),
("90210", "postcode"),
("us", "country"),
]
expected = {
"house_number": "123",
"road": "bayes way",
"city": "beverly hills",
"state": "ca",
"postcode": "90210",
"country": "us",
}

actual = parser.parse_address(test_address, merge_multiple_matches=True)

assert actual == expected
mock__parser_parse_address.assert_called_once_with(
Expand Down Expand Up @@ -69,23 +85,40 @@ def test_parse_address_multiple_matches_for_component(
"state": ["ca", "california"],
"postcode": ["90210"],
"country": ["us"],
"category": [],
"city_district": [],
"country_region": [],
"entrance": [],
"house": [],
"island": [],
"level": [],
"near": [],
"po_box": [],
"staircase": [],
"state_district": [],
"suburb": [],
"unit": [],
"world_region": [],
}

actual = parser.parse_address(test_address)
actual = parser.parse_address(test_address, merge_multiple_matches=False)

assert actual == expected
mock__parser_parse_address.assert_called_once_with(
test_address, language=None, country_code=None
)


@patch("libpypostal.parser._parse_address")
def test_parse_address_multiple_matches_for_component_merge_matches(
mock__parser_parse_address: MagicMock,
) -> None:
test_address = "123 Bayes Way, Beverly Hills, CA 90210 California US"
mock__parser_parse_address.return_value = [
("123", "house_number"),
("bayes way", "road"),
("beverly hills", "city"),
("ca", "state"),
("california", "state"),
("90210", "postcode"),
("us", "country"),
]
expected = {
"house_number": "123",
"road": "bayes way",
"city": "beverly hills",
"state": "ca california",
"postcode": "90210",
"country": "us",
}

actual = parser.parse_address(test_address, merge_multiple_matches=True)

assert actual == expected
mock__parser_parse_address.assert_called_once_with(
Expand Down

0 comments on commit c24851a

Please sign in to comment.