-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactors and reformats towards 0.3.0 release
- Loading branch information
1 parent
303f467
commit 73201e4
Showing
29 changed files
with
2,307 additions
and
1,950 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,66 @@ | ||
''' | ||
""" | ||
main geograpy 3 module | ||
''' | ||
""" | ||
__version__ = "0.3.0" | ||
from geograpy.extraction import Extractor | ||
from geograpy.places import PlaceContext | ||
from geograpy.locator import Locator | ||
from geograpy.labels import Labels | ||
from geograpy.locator import Locator | ||
from geograpy.places import PlaceContext | ||
|
||
|
||
def get_geoPlace_context(url=None, text=None,debug=False): | ||
''' | ||
def get_geoPlace_context(url=None, text=None, debug=False): | ||
""" | ||
Get a place context for a given text with information | ||
about country, region, city and other | ||
based on NLTK Named Entities having the Geographic(GPE) label. | ||
Args: | ||
url(String): the url to read text from (if any) | ||
text(String): the text to analyze | ||
debug(boolean): if True show debug information | ||
Returns: | ||
places: | ||
places: | ||
PlaceContext: the place context | ||
''' | ||
places=get_place_context(url, text, labels=Labels.geo, debug=debug) | ||
""" | ||
places = get_place_context(url, text, labels=Labels.geo, debug=debug) | ||
return places | ||
|
||
def get_place_context(url=None, text=None,labels=Labels.default, debug=False): | ||
''' | ||
|
||
|
||
def get_place_context(url=None, text=None, labels=Labels.default, debug=False): | ||
""" | ||
Get a place context for a given text with information | ||
about country, region, city and other | ||
based on NLTK Named Entities in the label set Geographic(GPE), | ||
based on NLTK Named Entities in the label set Geographic(GPE), | ||
Person(PERSON) and Organization(ORGANIZATION). | ||
Args: | ||
url(String): the url to read text from (if any) | ||
text(String): the text to analyze | ||
debug(boolean): if True show debug information | ||
Returns: | ||
pc: | ||
pc: | ||
PlaceContext: the place context | ||
''' | ||
e = Extractor(url=url, text=text,debug=debug) | ||
""" | ||
e = Extractor(url=url, text=text, debug=debug) | ||
e.find_entities(labels=labels) | ||
places=e.places | ||
places = e.places | ||
pc = PlaceContext(places) | ||
pc.setAll() | ||
return pc | ||
|
||
def locateCity(location,correctMisspelling=False,debug=False): | ||
''' | ||
|
||
def locateCity(location, correctMisspelling=False, debug=False): | ||
""" | ||
locate the given location string | ||
Args: | ||
location(string): the description of the location | ||
Returns: | ||
Locator: the location | ||
''' | ||
e = Extractor(text=location,debug=debug) | ||
""" | ||
e = Extractor(text=location, debug=debug) | ||
e.split() | ||
loc=Locator.getInstance(correctMisspelling=correctMisspelling,debug=debug) | ||
city=loc.locateCity(e.places) | ||
loc = Locator.getInstance(correctMisspelling=correctMisspelling, debug=debug) | ||
city = loc.locateCity(e.places) | ||
return city | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,93 +1,99 @@ | ||
import nltk | ||
import re | ||
|
||
import nltk | ||
from newspaper import Article | ||
|
||
from geograpy.labels import Labels | ||
|
||
|
||
class Extractor(object): | ||
''' | ||
""" | ||
Extract geo context for text or from url | ||
''' | ||
""" | ||
|
||
def __init__(self, text=None, url=None, debug=False): | ||
''' | ||
""" | ||
Constructor | ||
Args: | ||
text(string): the text to analyze | ||
url(string): the url to read the text to analyze from | ||
debug(boolean): if True show debug information | ||
''' | ||
""" | ||
if not text and not url: | ||
raise Exception('text or url is required') | ||
self.debug=debug | ||
raise Exception("text or url is required") | ||
self.debug = debug | ||
self.text = text | ||
self.url = url | ||
self.places = [] | ||
nltk_packages = ['maxent_ne_chunker', | ||
'words', | ||
'treebank', | ||
'maxent_treebank_pos_tagger', | ||
'punkt', | ||
'averaged_perceptron_tagger' | ||
] | ||
nltk_packages = [ | ||
"maxent_ne_chunker", | ||
"words", | ||
"treebank", | ||
"maxent_treebank_pos_tagger", | ||
"punkt", | ||
"averaged_perceptron_tagger", | ||
] | ||
for nltk_package in nltk_packages: | ||
try: | ||
import nltk | ||
|
||
nltk.data.find(nltk_package) | ||
except LookupError: | ||
nltk.downloader.download(nltk_package, quiet=True) | ||
import nltk | ||
import nltk | ||
|
||
def set_text(self): | ||
''' | ||
""" | ||
Setter for text | ||
''' | ||
""" | ||
if not self.text and self.url: | ||
a = Article(self.url) | ||
a.download() | ||
a.parse() | ||
self.text = a.text | ||
def split(self,delimiter=r","): | ||
''' | ||
|
||
def split(self, delimiter=r","): | ||
""" | ||
simpler regular expression splitter with not entity check | ||
hat tip: https://stackoverflow.com/a/1059601/1497139 | ||
''' | ||
""" | ||
self.set_text() | ||
self.places=re.split(delimiter,self.text) | ||
self.places = re.split(delimiter, self.text) | ||
|
||
def find_geoEntities(self): | ||
''' | ||
""" | ||
Find geographic entities | ||
Returns: | ||
list: | ||
list: | ||
List of places | ||
''' | ||
""" | ||
self.find_entities(Labels.geo) | ||
return self.places | ||
def find_entities(self,labels=Labels.default): | ||
''' | ||
|
||
def find_entities(self, labels=Labels.default): | ||
""" | ||
Find entities with the given labels set self.places and returns it | ||
Args: | ||
labels: | ||
labels: | ||
Labels: The labels to filter | ||
Returns: | ||
list: | ||
list: | ||
List of places | ||
''' | ||
""" | ||
self.set_text() | ||
|
||
text = nltk.word_tokenize(self.text) | ||
nes = nltk.ne_chunk(nltk.pos_tag(text)) | ||
|
||
for ne in nes: | ||
if type(ne) is nltk.tree.Tree: | ||
nelabel=ne.label() | ||
if (nelabel in labels): | ||
leaves=ne.leaves() | ||
nelabel = ne.label() | ||
if nelabel in labels: | ||
leaves = ne.leaves() | ||
if self.debug: | ||
print(leaves) | ||
self.places.append(u' '.join([i[0] for i in leaves])) | ||
return self.places | ||
self.places.append(" ".join([i[0] for i in leaves])) | ||
return self.places |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,14 @@ | ||
''' | ||
""" | ||
Created on 2020-09-10 | ||
@author: wf | ||
''' | ||
""" | ||
|
||
|
||
class Labels(object): | ||
''' | ||
""" | ||
NLTK labels | ||
''' | ||
default=['GPE','GSP','PERSON','ORGANIZATION'] | ||
geo=['GPE','GSP'] | ||
""" | ||
|
||
default = ["GPE", "GSP", "PERSON", "ORGANIZATION"] | ||
geo = ["GPE", "GSP"] |
Oops, something went wrong.