From 1d7e09cfdb9a1ce2468583b5cbc4c3d235c43eff Mon Sep 17 00:00:00 2001 From: gmweaver Date: Tue, 4 Jun 2024 22:22:23 -0700 Subject: [PATCH] add python lib --- .github/workflows/wheels.yml | 31 +++++++ .gitignore | 7 ++ .gitmodules | 3 + MANIFEST.IN | 1 + README.md | 38 ++++++++ install_libpostal.sh | 39 ++++++++ libpostal | 1 + libpypostal/__init__.py | 0 libpypostal/parser.py | 69 ++++++++++++++ pyproject.toml | 10 +++ setup.py | 38 ++++++++ src/pyparser.c | 169 +++++++++++++++++++++++++++++++++++ src/pyutils.c | 128 ++++++++++++++++++++++++++ src/pyutils.h | 16 ++++ 14 files changed, 550 insertions(+) create mode 100644 .github/workflows/wheels.yml create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 MANIFEST.IN create mode 100644 README.md create mode 100755 install_libpostal.sh create mode 160000 libpostal create mode 100644 libpypostal/__init__.py create mode 100644 libpypostal/parser.py create mode 100644 pyproject.toml create mode 100644 setup.py create mode 100644 src/pyparser.c create mode 100644 src/pyutils.c create mode 100644 src/pyutils.h diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000..dd95018 --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,31 @@ +name: Build + +on: [push, pull_request] + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-13, macos-14] + + steps: + - uses: actions/checkout@v4 + + # Used to host cibuildwheel + - uses: actions/setup-python@v5 + + - name: Install cibuildwheel + run: python -m pip install cibuildwheel==2.18.1 + + - name: Build wheels + run: python -m cibuildwheel --output-dir wheelhouse + # to supply options, put them in 'env', like: + # env: + # CIBW_SOME_OPTION: value + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} + path: ./wheelhouse/*.whl \ No newline at 
end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9c38350 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__ +*.egg-info +*.so +build +dist +.venv +.vscode \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..1257d7c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "libpostal"] + path = libpostal + url = https://github.com/openvenues/libpostal.git diff --git a/MANIFEST.IN b/MANIFEST.IN new file mode 100644 index 0000000..f30e5b2 --- /dev/null +++ b/MANIFEST.IN @@ -0,0 +1 @@ +include src/pyutils.h \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f551d58 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# pylibpostal + +Python wrapper for the open-source libpostal project. Custom library built internally due to lack of continued support for current Python wrapper libraries. + +## Usage + +### Install libpostal C library + +By default, libpostal will be installed when the Python package is installed, but without the data. + +The commands run during installation are shown below. + +``` +git clone https://github.com/openvenues/libpostal \ + && cd libpostal \ + && ./bootstrap.sh \ + && ./configure --datadir=/tmp/libpostal_data_files --disable-data-download --disable-sse2 \ + && make -j4 \ + && make install \ + && ldconfig +``` + +- `--disable-data-download` disables downloading data when installing. +- `--disable-sse2` is required for Mac M1. +- `ldconfig` is only needed on Linux. + +See https://github.com/openvenues/libpostal?tab=readme-ov-file#installation-maclinux for more details. + +### Downloading libpostal data + +``` +libpostal_data download all +``` + +## Contributing +To test the project, run `poetry test`. Test files may live together with the code or in a separate +directory, but in order for them to be discovered, they should end with `_test.py` +(e.g. `pylibpostal/something_test.py` or `pylibpostal_test/something_test.py`).
#!/bin/bash
#
# Builds and installs the libpostal C library (tag v1.1) from the `libpostal`
# checkout in the current working directory, without downloading its data
# files.
#
# Steps:
#   1. Install the build prerequisites with the platform package manager
#      (apt-get on Linux, brew on macOS; nothing is installed on FreeBSD).
#   2. Check out tag v1.1, then bootstrap, configure, build and install.
#   3. Refresh the shared-library cache on Linux.
#
# Any positional arguments are ignored.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline, so a broken step is never followed by
# `make install` from a half-configured tree.
set -euo pipefail

OS=$(uname -s)

case "$OS" in
    Linux)
        echo "Detected Linux"
        # -y: the script may run without a terminal attached, so apt-get
        # must not wait for interactive confirmation.
        sudo apt-get install -y clang curl autoconf automake libtool pkg-config
        ;;
    Darwin)
        echo "Detected macOS"
        brew install curl autoconf automake libtool pkg-config
        ;;
    FreeBSD)
        echo "Detected FreeBSD"
        # No prerequisites are installed here; the build tools are expected
        # to be present already.
        ;;
    *)
        # Stop instead of attempting a build on an unknown platform.
        echo "OS not supported: $OS" >&2
        exit 1
        ;;
esac

cd libpostal
git checkout tags/v1.1
./bootstrap.sh
./configure --datadir=/tmp/libpostal_data_files --disable-data-download --disable-sse2
make -j4
sudo make install

if [ "$OS" = "Linux" ]; then
    # Make the freshly installed shared library visible to the dynamic linker.
    sudo ldconfig
fi
def parse_address(
    address: str, language: Optional[str] = None, country_code: Optional[str] = None
) -> Dict[str, List[str]]:
    """Parses address into components.

    Arguments:
        address: the address to parse.
        language: optional language code to help localize parsing.
        country_code: optional country code to help localize parsing.

    Returns:
        Dictionary of address components with format
        {address component name: [parsed values]}.
        Generally, address component lists will only have one element, but there is a
        possibility of multiple matches. Address components not found in the input are
        set to empty lists.
    """
    # The C extension returns (value, label) pairs, in that order.
    address_component_tuples: List[
        Tuple[str, str]
    ] = _parser.parse_address(  # pylint: disable=c-extension-no-member
        address, language=language, country=country_code
    )

    parsed_address_components: Dict[str, List[str]] = {}

    # Group values by label; a label that occurs more than once collects every
    # value in the order it was returned.
    for component_value, component_name in address_component_tuples:
        parsed_address_components.setdefault(component_name, []).append(
            component_value
        )

    # Guarantee a key for every known component so callers can index the result
    # without checking for membership first.
    for libpostal_address_component in LibpostalAddressComponent:
        parsed_address_components.setdefault(libpostal_address_component.value, [])

    return parsed_address_components
_install_libpostal() + +ext_modules = [ + Extension( + "libpypostal._parser", + sources=["src/pyparser.c", "src/pyutils.c"], + libraries=["postal"], + include_dirs=["/usr/local/include", "src/"], + library_dirs=["/usr/local/lib"], + extra_compile_args=["-std=c99"], + ), +] + +setup(ext_modules=ext_modules) diff --git a/src/pyparser.c b/src/pyparser.c new file mode 100644 index 0000000..786b827 --- /dev/null +++ b/src/pyparser.c @@ -0,0 +1,169 @@ +#include +#include + +#include "pyutils.h" + +struct module_state { + PyObject *error; +}; + +#define GETSTATE(m) ((struct module_state *)PyModule_GetState(m)) + +static PyObject *py_parse_address(PyObject *self, PyObject *args, + PyObject *keywords) { + PyObject *arg_input; + PyObject *arg_language = Py_None; + PyObject *arg_country = Py_None; + + PyObject *result = NULL; + + char *datadir = getenv("LIBPOSTAL_DATA_DIR"); + + if ((datadir != NULL) && (!libpostal_setup_datadir(datadir) || + !libpostal_setup_parser_datadir(datadir)) || + (!libpostal_setup() || !libpostal_setup_parser())) { + PyErr_SetString(PyExc_TypeError, "Error loading libpostal data"); + } + + static char *kwlist[] = {"address", "language", "country", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywords, "O|OO:pyparser", kwlist, + &arg_input, &arg_language, &arg_country)) { + return 0; + } + + char *input = PyObject_to_string(arg_input); + + if (input == NULL) { + return NULL; + } + + char *language = NULL; + + if (arg_language != Py_None) { + language = PyObject_to_string(arg_language); + if (language == NULL) { + goto exit_free_input; + } + } + + char *country = NULL; + + if (arg_country != Py_None) { + country = PyObject_to_string(arg_language); + if (country == NULL) { + goto exit_free_language; + } + } + + libpostal_address_parser_options_t options = + libpostal_get_address_parser_default_options(); + options.language = language; + options.country = country; + + libpostal_address_parser_response_t *parsed = + libpostal_parse_address(input, 
options); + if (parsed == NULL) { + goto exit_free_country; + } + + result = PyList_New((Py_ssize_t)parsed->num_components); + if (!result) { + goto exit_destroy_response; + } + + for (int i = 0; i < parsed->num_components; i++) { + char *component = parsed->components[i]; + char *label = parsed->labels[i]; + PyObject *component_unicode = PyUnicode_DecodeUTF8( + (const char *)component, strlen(component), "strict"); + if (component_unicode == NULL) { + Py_DECREF(result); + goto exit_destroy_response; + } + + PyObject *label_unicode = + PyUnicode_DecodeUTF8((const char *)label, strlen(label), "strict"); + if (label_unicode == NULL) { + Py_DECREF(component_unicode); + Py_DECREF(result); + goto exit_destroy_response; + } + PyObject *tuple = + Py_BuildValue("(OO)", component_unicode, label_unicode); + if (tuple == NULL) { + Py_DECREF(component_unicode); + Py_DECREF(label_unicode); + goto exit_destroy_response; + } + + // Note: PyList_SetItem steals a reference, so don't worry about DECREF + PyList_SetItem(result, (Py_ssize_t)i, tuple); + + Py_DECREF(component_unicode); + Py_DECREF(label_unicode); + } + +exit_destroy_response: + libpostal_address_parser_response_destroy(parsed); +exit_free_country: + if (country != NULL) { + free(country); + } +exit_free_language: + if (language != NULL) { + free(language); + } +exit_free_input: + if (input != NULL) { + free(input); + } + return result; +} + +static PyMethodDef parser_methods[] = { + {"parse_address", (PyCFunction)py_parse_address, + METH_VARARGS | METH_KEYWORDS, "parse_address(text, language, country)"}, + {NULL, NULL}, +}; + +static int parser_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int parser_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + libpostal_teardown(); + libpostal_teardown_parser(); + return 0; +} + +static struct PyModuleDef module_def = {PyModuleDef_HEAD_INIT, + "_parser", + NULL, + sizeof(struct module_state), + parser_methods, + 
NULL, + parser_traverse, + parser_clear, + NULL}; + +#define INITERROR return NULL + +PyObject *PyInit__parser(void) { + PyObject *module = PyModule_Create(&module_def); + + if (module == NULL) { + INITERROR; + } + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("_parser.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + + return module; +} diff --git a/src/pyutils.c b/src/pyutils.c new file mode 100644 index 0000000..b7fe274 --- /dev/null +++ b/src/pyutils.c @@ -0,0 +1,128 @@ +#include "pyutils.h" + +void string_array_destroy(char **strings, size_t num_strings) { + if (strings != NULL) { + for (size_t i = 0; i < num_strings; i++) { + if (strings[i] != NULL) { + free(strings[i]); + } + } + free(strings); + } +} + +char *PyObject_to_string(PyObject *obj) { + if (!PyUnicode_Check(obj)) { + if (!PyBytes_Check(obj)) { + PyErr_SetString(PyExc_TypeError, + "Parameter must be bytes or unicode"); + return NULL; + } + } + PyObject *unistr = PyUnicode_FromObject(obj); + + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be converted to unicode"); + return NULL; + } + + char *out = PyUnicode_AsUTF8(unistr); + + // Need to copy the string, otherwise it's a dup + char *out_copy = strdup(out); + + Py_XDECREF(unistr); + + return out_copy; +} + +char **PyObject_to_strings_max_len(PyObject *obj, ssize_t max_len, + size_t *num_strings) { + char **out = NULL; + size_t n = 0; + if (!PySequence_Check(obj)) { + return NULL; + } + + PyObject *seq = PySequence_Fast(obj, "Expected a sequence"); + Py_ssize_t len = PySequence_Length(obj); + + if (len > 0) { + out = calloc(len, sizeof(char *)); + if (out == NULL) { + return NULL; + } + + char *str = NULL; + + for (int i = 0; i < len; i++) { + PyObject *item = PySequence_Fast_GET_ITEM(seq, i); + + str = NULL; + + str = PyObject_to_string(item); + if (str == NULL) { + PyErr_SetString(PyExc_TypeError, + "all elements must be strings"); + goto 
exit_destroy_strings; + } + + if (max_len > 0 && strlen(str) >= max_len) { + PyErr_SetString(PyExc_TypeError, + "string exceeded maximum length"); + goto exit_destroy_strings; + } + + out[i] = str; + n++; + } + } + + if (n > 0) { + *num_strings = n; + } else { + free(out); + out = NULL; + *num_strings = 0; + } + + Py_DECREF(seq); + + return out; + +exit_destroy_strings: + for (size_t i = 0; i < len; i++) { + char *s = out[i]; + if (s != NULL) { + free(s); + } + } + free(out); + Py_DECREF(seq); + return 0; +} + +char **PyObject_to_strings(PyObject *obj, size_t *num_strings) { + return PyObject_to_strings_max_len(obj, -1, num_strings); +} + +PyObject *PyObject_from_strings(char **strings, size_t num_strings) { + PyObject *result = PyList_New((Py_ssize_t)num_strings); + if (!result) { + return NULL; + } + + for (int i = 0; i < num_strings; i++) { + char *str = strings[i]; + PyObject *u = + PyUnicode_DecodeUTF8((const char *)str, strlen(str), "strict"); + if (u == NULL) { + Py_DECREF(result); + return NULL; + } + // Note: PyList_SetItem steals a reference, so don't worry about DECREF + PyList_SetItem(result, (Py_ssize_t)i, u); + } + return result; +} diff --git a/src/pyutils.h b/src/pyutils.h new file mode 100644 index 0000000..ad7bd63 --- /dev/null +++ b/src/pyutils.h @@ -0,0 +1,16 @@ +#ifndef HAVE_PYPOSTAL_UTILS_H +#define HAVE_PYPOSTAL_UTILS_H + +#include +#include + +void string_array_destroy(char **strings, size_t num_strings); + +char *PyObject_to_string(PyObject *obj); +char **PyObject_to_strings_max_len(PyObject *obj, ssize_t max_len, + size_t *num_strings); +char **PyObject_to_strings(PyObject *obj, size_t *num_strings); + +PyObject *PyObject_from_strings(char **strings, size_t num_strings); + +#endif