Skip to content

Commit 3cb837e

Browse files
committed
Add Socrata Data Nodes
1 parent 3fbba3d commit 3cb837e

File tree

4 files changed

+158
-0
lines changed

4 files changed

+158
-0
lines changed

knime_extension/geospatial_env.yml

+1
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,4 @@ dependencies:
3535
- pointpats=2.3.0
3636
- pip:
3737
- ipinfo==4.4.3
38+
- sodapy==2.2.0
Loading
Loading

knime_extension/src/nodes/opendata.py

+157
Original file line numberDiff line numberDiff line change
@@ -686,3 +686,160 @@ def execute(self, exec_context: knext.ExecutionContext):
686686
gdf = get_osmnx().geocode_to_gdf(self.placename)
687687
gdf = gdf.reset_index(drop=True)
688688
return knext.Table.from_pandas(gdf)
689+
690+
691+
############################################
692+
# Socrata Search
693+
############################################
694+
@knext.node(
    name="Socrata Search",
    node_type=knext.NodeType.SOURCE,
    icon_path=__NODE_ICON_PATH + "Socrata Search.png",
    category=__category,
    after="",
)
@knext.output_table(
    name="Socrata dataset list",
    description="Socrata dataset based on search keywords",
)
class SocrataSearchNode:
    """Retrieve the open data category via Socrata API.

    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits, and NGOs.
    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset list.
    """

    queryitem = knext.StringParameter(
        label="Input searching item",
        description="""Enter search keywords or dataset names to find relevant datasets in the Socrata database.
        This search is not case-sensitive and can include multiple words separated by spaces. """,
        default_value="Massachusetts",
    )

    def configure(self, configure_context):
        """Return no output schema; the columns are only known after the query runs."""
        # TODO Create combined schema
        return None

    def execute(self, exec_context: knext.ExecutionContext):
        """Search the Socrata catalog and return the matching datasets as a table.

        The catalog endpoint is queried for datasets matching ``self.queryitem``
        (at most 10000 results are requested). The JSON ``results`` list is
        flattened into one column per (nested) field, list-valued cells are
        joined into comma-separated strings, and columns that cannot be
        represented in a KNIME table or that are entirely empty are dropped.

        Args:
            exec_context: The KNIME execution context (not used directly).

        Returns:
            A ``knext.Table`` with one row per dataset found.

        Raises:
            ValueError: If the search returned no datasets.
        """
        import json
        from urllib.parse import urlencode
        from urllib.request import Request, urlopen

        import pandas as pd

        # URL-encode the parameters: the search term is free text and may
        # contain spaces or reserved characters such as "&" or "#", which
        # would otherwise produce an invalid or silently altered request.
        query_string = urlencode(
            {"q": self.queryitem, "only": "datasets", "limit": 10000}
        )
        request = Request(
            f"https://api.us.socrata.com/api/catalog/v1?{query_string}"
        )

        # The context manager guarantees the connection is closed even when
        # reading or parsing the response fails.
        with urlopen(request) as response:
            data = json.loads(response.read())

        # The "results" key contains the dataset information.
        dataset_info = data["results"]
        if not dataset_info:
            raise ValueError(
                f"No Socrata datasets found for the search term '{self.queryitem}'."
            )

        # Flatten the nested dictionaries into dotted column names.
        df = pd.json_normalize(dataset_info)
        # These two columns are discarded when present; errors="ignore" keeps
        # the node working when the response does not contain them.
        df = df.drop(
            columns=["classification.domain_tags", "classification.domain_metadata"],
            errors="ignore",
        )

        # Columns holding at least one list value need to be flattened to text.
        list_columns = [
            col for col in df.columns if any(isinstance(item, list) for item in df[col])
        ]
        for col in list_columns:
            try:
                df[col] = df[col].apply(
                    lambda x: ", ".join(x) if isinstance(x, list) else x
                )
            except TypeError:
                # str.join rejects lists with non-string items (e.g. nested
                # dictionaries); such columns are dropped.
                df = df.drop(columns=[col])

        # Probe each column individually and keep only those that KNIME can
        # convert. Any failure counts as "not convertible", hence the
        # deliberately broad except clause.
        unsupported_columns = []
        for col in df.columns:
            try:
                knext.Table.from_pandas(df[[col]])
            except Exception:
                unsupported_columns.append(col)
        df = df.drop(columns=unsupported_columns)

        # Treat "?" and empty strings as missing, then remove columns that
        # contain no data at all.
        df = df.replace("?", pd.NA)
        df = df.replace("", pd.NA)
        df = df.dropna(axis=1, how="all")
        df = df.reset_index(drop=True)
        return knext.Table.from_pandas(df)
780+
781+
782+
############################################
783+
# Socrata Data Query
784+
############################################
785+
@knext.node(
    name="Socrata Data Query",
    node_type=knext.NodeType.SOURCE,
    icon_path=__NODE_ICON_PATH + "Socrata Data Query.png",
    category=__category,
    after="",
)
@knext.output_table(
    name="Socrata dataset",
    description="Socrata dataset based on search keywords",
)
class SocrataDataNode:
    """Retrieve an open dataset via Socrata API.

    The Socrata Open Data API (SODA) is a powerful tool designed for programmatically accessing a vast array of open data resources from various organizations around the world, including governments, non-profits, and NGOs.
    This node uses the [SODA Consumer API](https://dev.socrata.com/consumers/getting-started.html) to get the dataset from a dataset list generated by Socrata Search Node.

    For instance, this dataset [Incidence Rate Of Breast Cancer](https://opendata.utah.gov/Health/Incidence-Rate-Of-Breast-Cancer-Per-100-000-All-St/q22t-rbk9) has a resource_id of "q22t-rbk9" and a metadata domain of "opendata.utah.gov".
    They can be found in the link under API, "https://opendata.utah.gov/resource/q22t-rbk9.json". Both items are used for data retrieval.
    """

    metadata_domain = knext.StringParameter(
        label="Metadata domain",
        description="""The value in the column metadata.domain of a table generated by a Socrata Search node. """,
        default_value="",
    )

    resource_id = knext.StringParameter(
        label="Resource ID",
        description="""The value in the column resource.id of a table generated by a Socrata Search node. """,
        default_value="",
    )

    def configure(self, configure_context):
        """Return no output schema; the columns depend on the requested dataset."""
        # TODO Create combined schema
        return None

    def execute(self, exec_context: knext.ExecutionContext):
        """Download a Socrata dataset and return it as a table.

        The dataset identified by ``self.resource_id`` is requested from the
        portal at ``self.metadata_domain`` with an unauthenticated client.
        At most 100000 rows are requested in a single call.

        Args:
            exec_context: The KNIME execution context (not used directly).

        Returns:
            A ``knext.Table`` built from the records returned by the API.

        Raises:
            ValueError: If the metadata domain or the resource ID is empty.
        """
        from contextlib import closing

        import pandas as pd
        from sodapy import Socrata

        # Both parameters default to an empty string; fail early with a clear
        # message instead of an obscure error from the client or the server.
        domain = self.metadata_domain.strip()
        dataset_id = self.resource_id.strip()
        if not domain:
            raise ValueError("The metadata domain must not be empty.")
        if not dataset_id:
            raise ValueError("The resource ID must not be empty.")

        # Upper bound on the number of rows requested from the API.
        row_limit = 100000

        # Unauthenticated client (None in place of an application token, no
        # username or password): only works with public datasets.
        # closing() releases the client's HTTP session when done.
        with closing(Socrata(domain, None)) as client:
            # sodapy converts the JSON response to a list of dictionaries.
            results = client.get(dataset_id, limit=row_limit)

        # Convert to pandas DataFrame
        results_df = pd.DataFrame.from_records(results)

        return knext.Table.from_pandas(results_df)

0 commit comments

Comments
 (0)