-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Mark Howison
committed
Sep 6, 2023
1 parent
402727b
commit 9417976
Showing
11 changed files
with
851 additions
and
10 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,31 @@ | ||
## My Project | ||
# job-posting-structure | ||
|
||
TODO: Fill this README out! | ||
Parses structured information from HTML-formatted job postings. | ||
|
||
Be sure to: | ||
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
|
||
* Change the title in this README | ||
* Edit your repository description on GitHub | ||
* Write in your license below and create a LICENSE file | ||
## JobStruct class | ||
|
||
## Security | ||
The primary class is called JobStruct and can be initialized from | ||
a filename, an HTML string, or an existing BeautifulSoup object that | ||
contains parsed HTML: | ||
|
||
See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. | ||
j = JobStruct.from_file("myJobPosting.html") | ||
|
||
## License | ||
with open("myJobPosting.html") as f: | ||
posting_html_str = f.read() | ||
j = JobStruct.from_string(posting_html_str) | ||
|
||
This library is licensed under the LICENSE NAME HERE License. | ||
posting_soup_obj = BeautifulSoup(posting_html_str, "html.parser") | ||
j = JobStruct.from_soup(posting_soup_obj) | ||
|
||
Once initialized, the JobStruct object has attributes for each segment | ||
that was parsed from the job posting: | ||
|
||
* description | ||
* benefits | ||
* qualifications | ||
* responsibilities | ||
* requirements | ||
* eeo (Equal Employment Opportunity) | ||
* other |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[build-system] | ||
requires = ["setuptools"] | ||
build-backend = "setuptools.build_meta" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
beautifulsoup4 >= 4.0.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
[metadata] | ||
name = jobstruct | ||
author = Suraj Maharjan | ||
author_email = [email protected] | ||
url = https://github.com/amazon-science/job-posting-structure | ||
version = attr: jobstruct.__version__ | ||
license = Creative Commons Non-Commercial 4.0 | ||
description = Parses structured information from HTML-formatted job postings. | ||
long_description = file: README.md | ||
long_description_content_type = text/markdown | ||
classifiers = | ||
Development Status :: 3 - Alpha | ||
License :: Free for non-commercial use | ||
Intended Audience :: Science/Research | ||
Operating System :: OS Independent | ||
Natural Language :: English | ||
Programming Language :: Python :: 3 | ||
Topic :: Scientific/Engineering | ||
Topic :: Scientific/Engineering :: Information Analysis | ||
|
||
[options] | ||
install_requires = file: requirements.txt | ||
python_requires = >= 3.8 | ||
setup_requires = setuptools |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: CC-BY-NC-4.0 | ||
|
||
from .jobstruct import JobStruct | ||
|
||
__version__ = "0.0.1" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: CC-BY-NC-4.0 | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
class JobStruct:
    """
    A parsed HTML job posting, split into labelled segments.

    Build an instance with `from_file`, `from_string` or `from_soup` (or by
    passing a BeautifulSoup object directly to the constructor). After
    construction every segment is available both through the `segments`
    dictionary and as a list attribute of the same name:

    * description
    * benefits
    * qualifications
    * responsibilities
    * requirements
    * eeo (Equal Employment Opportunity)
    * other
    """

    # Element names that are inspected when walking the document.
    __TAGS = [
        "p",
        "div",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
    ]

    # Segment name -> keywords that identify a heading of that segment.
    # Dictionary order matters: the first segment with a matching keyword wins.
    __SEGMENTS = {
        "description": frozenset((
            "description",
            "overview",
            "glance",
            "summary",
            "posting"
        )),
        "benefits": frozenset((
            "perks",
            "benefits",
            "offer"
        )),
        "qualifications": frozenset((
            "experience",
            "qualification",
            "qualifications",
            "skills",
        )),
        "responsibilities": frozenset((
            "responsibilities",
            "duties",
            "functions",
            "function(s)"
        )),
        "requirements": frozenset((
            "requirements",
            "required",
            "requirement"
        )),
        "eeo": frozenset((
            "equal",
            "opportunity",
            "employer"
        ))
    }

    def __init__(self, soup: "BeautifulSoup" = None):
        """
        Segment the job posting in `soup` and expose the segments as attributes.

        `soup` is HTML that has already been parsed by BeautifulSoup. If it is
        None (the default), the object is an empty structure in which every
        segment is an empty list.
        """
        self._init_segments()
        # Always define the attribute so empty structures can be inspected too.
        self.soup = soup
        if soup is not None:
            self._segment()
        self._add_attributes()

    @classmethod
    def from_file(cls, filename: str, encoding: str = None) -> "JobStruct":
        """
        Create a JobStruct object from the HTML stored in `filename`.

        `encoding` is forwarded to `open()`; the default of None keeps the
        platform's default text encoding.
        """
        with open(filename, encoding=encoding) as f:
            html = f.read()
        return cls(BeautifulSoup(html, "html.parser"))

    @classmethod
    def from_string(cls, html: str) -> "JobStruct":
        """
        Create a JobStruct object from the string `html` containing HTML.
        """
        return cls(BeautifulSoup(html, "html.parser"))

    @classmethod
    def from_soup(cls, soup: "BeautifulSoup") -> "JobStruct":
        """
        Create a JobStruct object from BeautifulSoup-parsed HTML in `soup`.
        """
        return cls(soup)

    def to_dict(self) -> dict:
        """
        Return a dictionary mapping each segment name to a copy of its list
        of text lines.
        """
        return {segment: list(values) for segment, values in self.segments.items()}

    def _init_segments(self):
        """
        Create an empty list for every segment type.
        """
        self.segments = {segment: [] for segment in JobStruct.__SEGMENTS}
        # Other is the catch-all type for segments that don't match a keyword.
        self.segments["other"] = []

    def _segment(self):
        """
        Walk the HTML elements, treating short texts as segment headings and
        appending the text of the elements that follow a heading to that
        segment's list.
        """
        # "html.parser" does not add a <body> around fragments, so fall back
        # to the whole document when there is none.
        root = self.soup.body
        if root is None:
            root = self.soup
        segment = "other"
        for element in root.find_all(JobStruct.__TAGS):
            text = element.get_text(separator="\n").strip()
            if not text:
                continue
            if len(text.split()) <= 5:
                # Five words or fewer: treat the text as a heading that
                # selects the segment for the elements that follow.
                segment = self._classify_segment(text.lower())
            elif self._is_terminal(element):
                # Only leaf elements contribute text; containers are skipped
                # because their children are visited separately.
                self._append_lines(segment, text)

    def _append_lines(self, segment: str, text: str):
        """
        Append every line of `text` to `segment`, except lines containing an
        equal-opportunity statement, which always go to the "eeo" segment.
        """
        for line in text.split("\n"):
            # Compare case-insensitively so "Equal Opportunity Employer"
            # is recognised as well.
            if "equal opportunity employer" in line.lower():
                self.segments["eeo"].append(line)
            else:
                self.segments[segment].append(line)

    def _classify_segment(self, text: str):
        """
        Classify `text` into one of the segment types using keywords.

        Each whitespace-separated word is compared, with surrounding colons
        removed, against the keyword sets; callers are expected to pass
        lower-cased text. Defaults to "other" if no keywords were found.
        """
        words = [word.strip(":") for word in text.split()]
        for segment, keywords in JobStruct.__SEGMENTS.items():
            if any(word in keywords for word in words):
                return segment
        return "other"

    def _is_terminal(self, element):
        """
        Return True if `element` contains none of the inspected tags as a
        descendant, i.e. it is a leaf with respect to the segmentation walk.
        """
        return element.find(JobStruct.__TAGS) is None

    def _add_attributes(self):
        """
        Expose every segment list as an attribute named after the segment.
        """
        for segment, values in self.segments.items():
            # Internal invariant: a segment name must never shadow an
            # existing attribute or method of the object.
            assert not hasattr(self, segment)
            setattr(self, segment, values)

    def __str__(self):
        """
        Render every segment except "other" as `name: [` ... `]`, with one
        entry per line, or as `name: []` when the segment is empty.
        """
        output = []
        for segment, values in self.segments.items():
            if segment == "other":
                continue
            if values:
                output.append(f"{segment}: [")
                output.extend(values)
                output.append("]")
            else:
                output.append(f"{segment}: []")
        return "\n".join(output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Marker file that indicates this package supports typing |
Oops, something went wrong.