-
Notifications
You must be signed in to change notification settings - Fork 74
v0.2.55..v0.2.56 changeset AddressNormalizer.cpp
Garret Voltz edited this page Aug 14, 2020
·
3 revisions
diff --git a/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp b/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp
index d173ae8..2c220c3 100644
--- a/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp
+++ b/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp
@@ -30,7 +30,8 @@
// hoot
#include <hoot/core/util/Log.h>
#include <hoot/core/conflate/address/LibPostalInit.h>
-#include <hoot/core/conflate/address/AddressTagKeys.h>
+#include <hoot/core/conflate/address/Address.h>
+#include <hoot/core/util/StringUtils.h>
// libpostal
#include <libpostal/libpostal.h>
@@ -41,11 +42,12 @@ namespace hoot
AddressNormalizer::AddressNormalizer() :
_numNormalized(0)
{
+ _addressTagKeys.reset(new AddressTagKeys()); // instance-owned tag key lookup; normalizeAddresses() uses this instead of AddressTagKeys::getInstance()
}
void AddressNormalizer::normalizeAddresses(const ElementPtr& e)
{
- const QSet<QString> addressTagKeys = AddressTagKeys::getInstance().getAddressTagKeys(*e);
+ const QSet<QString> addressTagKeys = _addressTagKeys->getAddressTagKeys(*e);
LOG_VART(addressTagKeys);
for (QSet<QString>::const_iterator addressTagKeyItr = addressTagKeys.begin();
addressTagKeyItr != addressTagKeys.end(); ++addressTagKeyItr)
@@ -84,13 +86,30 @@ void AddressNormalizer::normalizeAddresses(const ElementPtr& e)
QSet<QString> AddressNormalizer::normalizeAddress(const QString& address) const
{
+  // Normalizes a single address string and returns every normalized form found for it.
+  // QString::simplified() strips leading/trailing whitespace and collapses internal whitespace
+  // runs, so no separate trimmed() call is needed before it.
+  const QString cleanedAddress = address.simplified();
+
+  // libpostal doesn't handle intersections very well, so intersection normalization is done
+  // with custom logic.
+  if (Address::isStreetIntersectionAddress(cleanedAddress))
+  {
+    return _normalizeAddressIntersection(cleanedAddress);
+  }
+
+  // Everything else goes straight to libpostal.
+  return _normalizeAddressWithLibPostal(cleanedAddress);
+}
+
+QSet<QString> AddressNormalizer::_normalizeAddressWithLibPostal(const QString& address) const
+{
+ LOG_TRACE("Normalizing " << address << " with libpostal...");
+
+ QSet<QString> normalizedAddresses;
+ QString addressCopy = address;
+
// See note about init of this in AddressParser::parseAddresses.
LibPostalInit::getInstance();
- LOG_VART(address);
- const QString addressToNormalize = address.trimmed().simplified();
- LOG_VART(addressToNormalize);
- QSet<QString> normalizedAddresses;
+ _prepareAddressForLibPostalNormalization(addressCopy);
size_t num_expansions;
// specifying a language in the options is optional, but could we get better performance if
@@ -98,14 +117,13 @@ QSet<QString> AddressNormalizer::normalizeAddress(const QString& address) const
// first, of course)?
char** expansions =
libpostal_expand_address(
- addressToNormalize.toUtf8().data(), libpostal_get_default_options(),
- &num_expansions);
+ addressCopy.toUtf8().data(), libpostal_get_default_options(), &num_expansions);
// add all the normalizations libpostal finds as possible addresses
for (size_t i = 0; i < num_expansions; i++)
{
const QString normalizedAddress = QString::fromUtf8(expansions[i]);
LOG_VART(normalizedAddress);
- if (_isValidNormalizedAddress(addressToNormalize, normalizedAddress) &&
+ if (_isValidNormalizedAddress(addressCopy, normalizedAddress) &&
!normalizedAddresses.contains(normalizedAddress))
{
normalizedAddresses.insert(normalizedAddress);
@@ -122,6 +140,164 @@ QSet<QString> AddressNormalizer::normalizeAddress(const QString& address) const
return normalizedAddresses;
}
+// Normalizes a street intersection address (e.g. "Main St & 1st Ave").
+//
+// Steps:
+//   1. split the address into its two intersecting street parts
+//   2. expand a trailing street type abbreviation in each part to its full name
+//   3. strip plural street type suffixes (e.g. "Streets" -> "Street")
+//   4. if only one part has a street type, copy it onto the other part
+//   5. hand the rebuilt "<part 1> and <part 2>" string to libpostal
+//
+// @param address a whitespace-simplified intersection address
+// @return the normalized forms returned by _normalizeAddressWithLibPostal
+// @throws IllegalArgumentException if the address does not split into exactly two parts
+QSet<QString> AddressNormalizer::_normalizeAddressIntersection(const QString& address) const
+{
+  LOG_TRACE("Normalizing intersection: " << address << "...");
+
+  const QStringList addressParts =
+    StringUtils::splitOnAny(address, Address::getIntersectionSplitTokens(), 2);
+  LOG_VART(addressParts.size());
+
+  if (addressParts.size() != 2)
+  {
+    throw IllegalArgumentException(
+      "A non-intersection address was passed into street intersection address normalization.");
+  }
+
+  // Replace a trailing street type abbreviation in each intersection part with its full name
+  // counterpart. The two parts are kept in a list from here on rather than being joined and
+  // re-split, so there is no second split whose result would have to be validated.
+
+  const QMap<QString, QString> streetTypeAbbreviationsToFullTypes =
+    Address::getStreetTypeAbbreviationsToFullTypes();
+  QStringList modifiedAddressParts;
+  for (int i = 0; i < addressParts.size(); i++)
+  {
+    QString addressPart = addressParts.at(i).trimmed();
+    LOG_VART(addressPart);
+    for (QMap<QString, QString>::const_iterator itr = streetTypeAbbreviationsToFullTypes.begin();
+         itr != streetTypeAbbreviationsToFullTypes.end(); ++itr)
+    {
+      const QString abbrev = itr.key().trimmed();
+      if (abbrev.isEmpty())
+      {
+        continue;
+      }
+
+      // Only treat the abbreviation as a street type when it is the whole last word of the part.
+      // A bare suffix test would also hit names that merely end in the same letters (e.g. "1st"
+      // or "Forest" for the abbreviation "st").
+      const QString abbrevAsLastWord = " " + abbrev;
+      if (addressPart.endsWith(abbrevAsLastWord, Qt::CaseInsensitive))
+      {
+        // Swap exactly the matched trailing characters; this works regardless of the casing used
+        // in the address.
+        addressPart.chop(abbrev.size());
+        addressPart += itr.value().trimmed();
+        LOG_VART(addressPart);
+        // The last word can match at most one abbreviation.
+        break;
+      }
+    }
+    modifiedAddressParts.append(addressPart);
+  }
+  LOG_VART(modifiedAddressParts);
+
+  // Sometimes intersections have plural street types (suffixes). We want to be able to handle those
+  // as well, but we don't want them plural in the final normalized address.
+
+  const QStringList streetSingularTypes =
+    Address::getStreetFullTypesToTypeAbbreviations().keys();
+  QStringList streetFullTypes = streetSingularTypes;
+  QStringList streetPluralTypes;
+  for (int i = 0; i < streetSingularTypes.size(); i++)
+  {
+    const QString pluralType = streetSingularTypes.at(i) + "s";
+    if (!streetFullTypes.contains(pluralType))
+    {
+      streetFullTypes.append(pluralType);
+    }
+    streetPluralTypes.append(pluralType);
+  }
+  LOG_VART(streetPluralTypes);
+
+  // Singularize each part and record the street type it ends with (empty if none).
+  QStringList endingStreetTypes;
+  for (int i = 0; i < modifiedAddressParts.size(); i++)
+  {
+    // Trim before chopping so the character removed is the plural "s" and never a stray trailing
+    // space left over from the split.
+    QString modifiedAddressPart = modifiedAddressParts.at(i).trimmed();
+    if (StringUtils::endsWithAny(modifiedAddressPart, streetPluralTypes))
+    {
+      modifiedAddressPart.chop(1);
+      LOG_VART(modifiedAddressPart);
+    }
+    modifiedAddressParts[i] = modifiedAddressPart;
+
+    QString endingStreetType =
+      StringUtils::endsWithAnyAsStr(modifiedAddressPart, streetFullTypes).trimmed();
+    // Only strip the trailing "s" when the match really is a plural form. Chopping every type
+    // that ends in "s" would corrupt a singular street type that legitimately ends in "s".
+    if (streetPluralTypes.contains(endingStreetType, Qt::CaseInsensitive) &&
+        !streetSingularTypes.contains(endingStreetType, Qt::CaseInsensitive))
+    {
+      endingStreetType.chop(1);
+    }
+    LOG_VART(endingStreetType);
+    endingStreetTypes.append(endingStreetType);
+  }
+
+  // If one of the intersection parts has a street type and the other doesn't we'll copy one street
+  // type over to the other. This isn't foolproof as we could end up giving one of the intersections
+  // an incorrect street type (the actual element's address tags never get modified, though).
+  // However, it does help with address matching and will remain in place unless it's found to be
+  // causing harm in some way.
+
+  const QString firstIntersectionEndingStreetType = endingStreetTypes.at(0);
+  const QString secondIntersectionEndingStreetType = endingStreetTypes.at(1);
+  if (!firstIntersectionEndingStreetType.isEmpty() &&
+      secondIntersectionEndingStreetType.isEmpty())
+  {
+    modifiedAddressParts[1] = modifiedAddressParts.at(1) + " " + firstIntersectionEndingStreetType;
+    LOG_VART(modifiedAddressParts[1]);
+  }
+  else if (firstIntersectionEndingStreetType.isEmpty() &&
+           !secondIntersectionEndingStreetType.isEmpty())
+  {
+    modifiedAddressParts[0] = modifiedAddressParts.at(0) + " " + secondIntersectionEndingStreetType;
+    LOG_VART(modifiedAddressParts[0]);
+  }
+
+  const QString modifiedAddress =
+    modifiedAddressParts.at(0).trimmed() + " and " + modifiedAddressParts.at(1).trimmed();
+  LOG_VART(modifiedAddress);
+
+  // go ahead and send it to libpostal to finish out the normalization to avoid duplicating some
+  // code, although it probably won't change any from this point
+  return _normalizeAddressWithLibPostal(modifiedAddress);
+}
+
+// Rewrites, in place, an address that is about to be handed to libpostal: a trailing "St" word on
+// a non-intersection address is expanded to "Street".
+//
+// @param address the address to adjust; modified only when its last word is "st" (any casing)
+//   and it is not a street intersection address
+void AddressNormalizer::_prepareAddressForLibPostalNormalization(QString& address)
+{
+  LOG_TRACE("Before normalization fix: " << address);
+  LOG_VART(Address::isStreetIntersectionAddress(address));
+  // This is a nasty thing libpostal does where it changes "St" to "Saint" when it should be
+  // "Street".
+  //
+  // The match requires a preceding space so that only a standalone trailing "St" is expanded;
+  // street names that merely end in those letters (e.g. "West", "Crest") are left alone.
+  if (address.endsWith(" st", Qt::CaseInsensitive) &&
+      !Address::isStreetIntersectionAddress(address))
+  {
+    // Swap exactly the two matched trailing characters, whatever their casing.
+    address.chop(2);
+    address += "Street";
+  }
+  LOG_TRACE("After normalization fix: " << address);
+}
+
bool AddressNormalizer::_isValidNormalizedAddress(const QString& inputAddress,
const QString& normalizedAddress)
{
@@ -131,7 +307,8 @@ bool AddressNormalizer::_isValidNormalizedAddress(const QString& inputAddress,
return false;
}
// This is a bit of hack, but I don't like the way libpostal is turning "St" or "Street" into
- // "Saint". Should probably look into configuration of libpostal for a possible fix instead.
+ // "Saint". Should probably look into configuration of libpostal or update it for a possible fix
+ // instead.
else if (normalizedAddress.endsWith("saint", Qt::CaseInsensitive) &&
(inputAddress.endsWith("street", Qt::CaseInsensitive) ||
inputAddress.endsWith("st", Qt::CaseInsensitive)))