Skip to content

Commit

Permalink
refactors and reformats towards 0.3.0 release
Browse files Browse the repository at this point in the history
  • Loading branch information
WolfgangFahl committed Mar 29, 2024
1 parent 303f467 commit 73201e4
Show file tree
Hide file tree
Showing 29 changed files with 2,307 additions and 1,950 deletions.
50 changes: 0 additions & 50 deletions CHANGELOG.bak

This file was deleted.

4 changes: 0 additions & 4 deletions MANIFEST.in

This file was deleted.

59 changes: 31 additions & 28 deletions geograpy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,66 @@
'''
"""
main geograpy 3 module
'''
"""
__version__ = "0.3.0"
from geograpy.extraction import Extractor
from geograpy.places import PlaceContext
from geograpy.locator import Locator
from geograpy.labels import Labels
from geograpy.locator import Locator
from geograpy.places import PlaceContext


def get_geoPlace_context(url=None, text=None, debug=False):
    """
    Get a place context for a given text, restricted to geographic entities.

    Delegates to get_place_context with the label set Labels.geo, so only
    NLTK Named Entities carrying one of those labels are considered.

    Args:
        url(String): the url to read text from (if any)
        text(String): the text to analyze
        debug(boolean): if True show debug information

    Returns:
        PlaceContext: the place context
    """
    return get_place_context(url, text, labels=Labels.geo, debug=debug)

def get_place_context(url=None, text=None, labels=Labels.default, debug=False):
    """
    Get a place context for a given text with information
    about country, region, city and other
    based on NLTK Named Entities in the given label set.

    Args:
        url(String): the url to read text from (if any)
        text(String): the text to analyze
        labels(list): the NLTK entity labels to filter by; defaults to
            Labels.default
        debug(boolean): if True show debug information

    Returns:
        PlaceContext: the place context
    """
    extractor = Extractor(url=url, text=text, debug=debug)
    # find_entities fills extractor.places with the matching entity names
    extractor.find_entities(labels=labels)
    pc = PlaceContext(extractor.places)
    # NOTE(review): setAll() presumably derives the country/region/city
    # details from the place names - confirm against PlaceContext
    pc.setAll()
    return pc

def locateCity(location, correctMisspelling=False, debug=False):
    """
    locate the given location string

    The location is split at commas into its parts, which are then handed
    to the shared Locator instance.

    Args:
        location(string): the description of the location
        correctMisspelling(boolean): passed on to Locator.getInstance
        debug(boolean): if True show debug information

    Returns:
        the result of Locator.locateCity for the parts of the location
    """
    extractor = Extractor(text=location, debug=debug)
    # plain comma split - no named entity recognition is involved here
    extractor.split()
    loc = Locator.getInstance(correctMisspelling=correctMisspelling, debug=debug)
    return loc.locateCity(extractor.places)

84 changes: 45 additions & 39 deletions geograpy/extraction.py
Original file line number Diff line number Diff line change
@@ -1,93 +1,99 @@
import nltk
import re

import nltk
from newspaper import Article

from geograpy.labels import Labels


class Extractor(object):
    """
    Extract geo context for text or from url
    """

    # NLTK package id -> resource path for nltk.data.find, which expects a
    # category-qualified name such as "tokenizers/punkt"; a bare package id
    # is not found in a standard nltk_data layout and would trigger a
    # download attempt on every construction.
    _NLTK_RESOURCES = {
        "maxent_ne_chunker": "chunkers/maxent_ne_chunker",
        "words": "corpora/words",
        "treebank": "corpora/treebank",
        "maxent_treebank_pos_tagger": "taggers/maxent_treebank_pos_tagger",
        "punkt": "tokenizers/punkt",
        "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    }

    def __init__(self, text=None, url=None, debug=False):
        """
        Constructor

        Args:
            text(string): the text to analyze
            url(string): the url to read the text to analyze from
            debug(boolean): if True show debug information

        Raises:
            ValueError: if neither text nor url is given
        """
        if not text and not url:
            # ValueError is a subclass of Exception, so callers catching
            # Exception keep working
            raise ValueError("text or url is required")
        self.debug = debug
        self.text = text
        self.url = url
        self.places = []
        # make sure the required NLTK data packages are available
        for nltk_package, resource in self._NLTK_RESOURCES.items():
            try:
                nltk.data.find(resource)
            except LookupError:
                nltk.downloader.download(nltk_package, quiet=True)

    def set_text(self):
        """
        Make sure self.text is set: if there is no text yet but a url is
        available, download and parse the article and use its text
        """
        if not self.text and self.url:
            article = Article(self.url)
            article.download()
            article.parse()
            self.text = article.text

    def split(self, delimiter=r","):
        """
        simpler regular expression splitter with no entity check

        hat tip: https://stackoverflow.com/a/1059601/1497139

        Args:
            delimiter(string): the regular expression to split the text at

        The resulting parts replace self.places; surrounding whitespace of
        the parts is kept as is.
        """
        self.set_text()
        self.places = re.split(delimiter, self.text)

    def find_geoEntities(self):
        """
        Find geographic entities

        Returns:
            list: List of places
        """
        self.find_entities(Labels.geo)
        return self.places

    def find_entities(self, labels=None):
        """
        Find entities with the given labels, add them to self.places
        and return self.places

        Args:
            labels(list): the NLTK entity labels to filter by;
                None (the default) selects Labels.default

        Returns:
            list: List of places
        """
        if labels is None:
            labels = Labels.default
        self.set_text()

        tokens = nltk.word_tokenize(self.text)
        named_entities = nltk.ne_chunk(nltk.pos_tag(tokens))

        for ne in named_entities:
            # only subtrees are named entities - plain tokens are skipped
            if not isinstance(ne, nltk.tree.Tree):
                continue
            if ne.label() in labels:
                leaves = ne.leaves()
                if self.debug:
                    print(leaves)
                # each leaf is a (word, pos tag) pair - join the words
                self.places.append(" ".join([leaf[0] for leaf in leaves]))
        return self.places
14 changes: 8 additions & 6 deletions geograpy/labels.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
'''
"""
Created on 2020-09-10
@author: wf
'''
"""


class Labels(object):
    """
    NLTK named entity labels used to filter the extraction results
    """

    # label set used when no explicit labels are requested
    default = ["GPE", "GSP", "PERSON", "ORGANIZATION"]
    # label set restricted to geographic entities
    geo = ["GPE", "GSP"]
Loading

0 comments on commit 73201e4

Please sign in to comment.