From 0492a2651dd33d29b91baad8769496b2f1a1e341 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Thu, 30 Jan 2025 15:53:07 -0500 Subject: [PATCH 01/11] Scrape PDFs for EFS --- src/pudl_archiver/archivers/nrelefs.py | 167 +++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 src/pudl_archiver/archivers/nrelefs.py diff --git a/src/pudl_archiver/archivers/nrelefs.py b/src/pudl_archiver/archivers/nrelefs.py new file mode 100644 index 00000000..f9705b0e --- /dev/null +++ b/src/pudl_archiver/archivers/nrelefs.py @@ -0,0 +1,167 @@ +"""Download NREL Electrification Futures Study data.""" + +import re + +from pudl_archiver.archivers.classes import ( + AbstractDatasetArchiver, + ArchiveAwaitable, + ResourceInfo, +) + +# Main page +# https://www.nrel.gov/analysis/electrification-futures.html + +# Grab all data sites with the following formats +# https://data.nrel.gov/submissions/90 +# https://data.openei.org/submissions/4130 + +# Also grab all PDFs on the main page +BASE_URL = "https://www.nrel.gov/analysis/electrification-futures.html" + + +class NrelEFSArchiver(AbstractDatasetArchiver): + """NREL Electrification Futures Studies archiver.""" + + name = "nrelefs" + + async def get_resources(self) -> ArchiveAwaitable: + """Download NREL EFS resources. + + The main page links to a series of PDFs as well as data.nrel.gov and data.openei.org webpages + containing associated data for each report. + """ + data_link_pattern = re.compile( + r"https:\/\/data.nrel.gov\/submissions\/|https:\/\/data.openei.org\/submissions\/" + ) + # Regex for matching the two pages containing data for a study on the NREL EFS page. + + pdf_pattern = re.compile(r"\/docs\/fy(\d{2})osti\/\w*.pdf") + # Regex for matching a PDF on the page + + # From the main page, grab all the PDFs + pdf_links = await self.get_hyperlinks(BASE_URL, pdf_pattern) + for link, filename in pdf_links.items(): + # Flow through workflow to identify the version of the PDF, + # the final filename + + # Clean up file name + self.logger.info(f"Downloading {link}") + filename = ( + filename.lower() + .replace("\n", "") + .replace("electrification futures study:", "") + ) + filename = re.sub( + "[^a-zA-Z0-9 -]+", "", filename + ).strip() # Remove all non-word, digit space or - characters + filename = re.sub(r"\s+", "-", filename) # Replace 1+ space with a dash + + # Map technical reports to versions + technical_report_version_map = { + "operational-analysis-of-us-power-systems-with-increased-electrification-and-demand-side-flexibility": 6, + "scenarios-of-power-system-evolution-and-infrastructure-development-for-the-united-states": 5, + "methodological-approaches-for-assessing-long-term-power-system-impacts-of-end-use-electrificatio": 4, + "the-demand-side-grid-dsgrid-model-documentation": 3, + "scenarios-of-electric-technology-adoption-and-power-consumption-for-the-united-states": 2, + "end-use-electric-technology-cost-and-performance-projections-through-2050": 1, + } + + if filename in technical_report_version_map: + final_filename = f"nrelefs-{filename}.pdf" + partitions = { + "report_number": technical_report_version_map[filename], + "document_type": "technical_report", + } + + # Map "presentation slides" to version based on URL + elif filename == "presentation-slides": + link_to_version = { + "/docs/fy21osti/80167.pdf": 6, + "/docs/fy21osti/78783.pdf": 5, + "/docs/fy18osti/72096.pdf": 2, + } + + report_number = link_to_version[link] + final_filename = f"nrelefs-{str(report_number)}-{filename}.pdf" + partitions = { + "report_number": 
report_number, + "document_type": "presentation", + } + + # Handle 2 special cases + elif ( + filename + == "electrification-of-industry-summary-of-efs-industrial-sector-analysis" + ): + final_filename = f"nrelefs-{filename}.pdf" + partitions = { + "report_number": 2, + "document_type": "industrial_sector_presentation", + } + + elif filename == "the-demand-side-grid-dsgrid-model": + final_filename = f"nrelefs-{filename}.pdf" + partitions = {"report_number": 3, "document_type": "presentation"} + + # Ignore a few other PDF links on the page that aren't from the EFS + else: + self.logger.warn(f"Found {filename} at {link} but didn't download.") + continue + yield self.get_pdf_resource(final_filename, link, partitions) + + # For each data link found on the page, iterate through and download files + for link in await self.get_hyperlinks(BASE_URL, data_link_pattern): + yield self.get_version_resource(link=link) + + async def get_version_resource( + self, links: dict[str, str], year: int + ) -> ResourceInfo: + """Download all available data for a given page of EFS data. + + Resulting resource contains one zip file of CSVs per state/territory, plus a handful of .xlsx dictionary and geocoding files. + + Args: + links: filename->URL mapping for files to download + year: the year we're downloading data for + """ + # host = "https://data.openei.org" + # zip_path = self.download_directory / f"doelead-{year}.zip" + # data_paths_in_archive = set() + # for filename, link in sorted(links.items()): + # self.logger.info(f"Downloading {link}") + # download_path = self.download_directory / filename + # await self.download_file(f"{host}{link}", download_path) + # self.add_to_archive( + # zip_path=zip_path, + # filename=filename, + # blob=download_path.open("rb"), + # ) + # data_paths_in_archive.add(filename) + # # Don't want to leave multiple giant files on disk, so delete + # # immediately after they're safely stored in the ZIP + # download_path.unlink() + # return ResourceInfo( + # local_path=zip_path, + # partitions={"year": year}, + # layout=ZipLayout(file_paths=data_paths_in_archive), + # ) + + async def get_pdf_resource( + self, final_filename: str, link: str, partitions: dict[str, str | int] + ) -> ResourceInfo: + """Download PDF resource. + + Resulting resource contains one PDF file with information about the EFS dataset. 
+ + Args: + link: filename->URL mapping for files to download + filename: the name of the file on the NREL EFS webpage + partitions: partitions for downloaded file + """ + download_path = self.download_directory / final_filename + full_link = f"https://www.nrel.gov/{link}" + await self.download_file(url=full_link, file_path=download_path) + return ResourceInfo( + local_path=download_path, + partitions=partitions, + ) From be4ba02c9573a63d3e7dad2e151b237ef58f8a85 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Wed, 12 Feb 2025 18:12:15 -0500 Subject: [PATCH 02/11] Add NREL Siting Lab dataset archiver --- src/pudl_archiver/archivers/nrel/siting.py | 120 +++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 src/pudl_archiver/archivers/nrel/siting.py diff --git a/src/pudl_archiver/archivers/nrel/siting.py b/src/pudl_archiver/archivers/nrel/siting.py new file mode 100644 index 00000000..b9db63de --- /dev/null +++ b/src/pudl_archiver/archivers/nrel/siting.py @@ -0,0 +1,120 @@ +"""Download data from the NREL siting lab data.""" + +import re + +from pudl_archiver.archivers.classes import ( + AbstractDatasetArchiver, + ArchiveAwaitable, + ResourceInfo, + retry_async, +) +from pudl_archiver.frictionless import ZipLayout + + +class NrelSitingArchiver(AbstractDatasetArchiver): + """NREL Siting Lab Data archiver.""" + + name: str = "nrelsiting" + base_url: str = "https://data.openei.org/siting_lab" + + async def get_resources(self) -> ArchiveAwaitable: + """Using data IDs, iterate and download all NREL Siting Lab files.""" + # The links on the table are hidden through Javascript. However, + # the IDs are exposed on this JS file, which links each dataset ID to an image. + # Rather than using Selenium, we can use this file to identify the links for all + # datasets hosted through the siting lab. + url = "https://data.openei.org/api" + data = { + "action": "getSubmissionStatistics", + "format": "json", + "s": "siting_lab", + } + response = await retry_async( + self.session.post, args=[url], kwargs={"data": data} + ) + data_dict = await response.json() + + self.logger.info( + f"Downloading data for {data_dict['numSubmissions']} datasets. {data_dict['numFiles']} files ({data_dict['sizeOfFiles'] / 1e-9} GB)." + ) + for item in data_dict["submissions"]: + yield self.get_siting_resources(item) + + async def download_nrel_data(self, dataset_id: str, dataset_link: str) -> set: + """For a given NREL dataset link, grab all PDFs and data links from the page.""" + # There are many file types here, so we match using the more general link pattern + # e.g., https://data.openei.org/files/6121/nexrad_4km.tif + # We also grab the PDF files, which are hosted on a different part of the + # NREL website. 
E.g., https://www.nrel.gov/docs/fy24osti/87843.pdf + download_links = set() + + data_pattern = re.compile(rf"files\/{dataset_id}\/") + pdf_data_pattern = re.compile(r"docs\/[\w\/]*.pdf$") + + # Get data + data_download_links = await self.get_hyperlinks(dataset_link, data_pattern) + for link in data_download_links: + full_link = f"https://data.openei.org{link}" + download_links.add(full_link) + + # Get PDFs + pdf_download_links = await self.get_hyperlinks(dataset_link, pdf_data_pattern) + download_links.update(pdf_download_links) + return download_links + + async def get_siting_resources(self, dataset_dict: dict[str, str | int | list]): + """Download all files for a siting resource.""" + dataset_id = dataset_dict["xdrId"] + + dataset_link = f"https://data.openei.org/submissions/{dataset_id}" + self.logger.info(f"Downloading files from {dataset_link}") + + # Create zipfile name from dataset name + title = dataset_dict["submissionName"] + dataset_name = title.lower().strip() + dataset_name = re.sub( + r"([^a-zA-Z0-9 ])", "", dataset_name + ) # Drop all non-space special characters + dataset_name = dataset_name.replace(" ", "-") + + zip_path = self.download_directory / f"nrelsiting-{dataset_name}.zip" + data_paths_in_archive = set() + + # First, get all the links from the page itself + data_links = await self.download_nrel_data( + dataset_id=dataset_id, dataset_link=dataset_link + ) + + # A few datasets have an additional linked data page: + # e.g., https://data.openei.org/submissions/1932 + additional_datasets_pattern = re.compile(r"\/submissions\/\d{4}") + links = await self.get_hyperlinks(dataset_link, additional_datasets_pattern) + + # For each additional dataset linked, iterate through the same process + for link in links: + additional_dataset_id = link.split("/")[-1] + additional_data_paths_in_archive = await self.download_nrel_data( + dataset_id=additional_dataset_id, dataset_link=link + ) + data_links.update(additional_data_paths_in_archive) + + # For each link we've collected, download it and add it to the zipfile + for link in set(data_links): # Use set to handle duplication + filename = link.split("/")[-1] + # This file shows up in multiple datasets, + # causing collision when they run concurrently. Rename it + # to avoid this problem. 
+ if filename == "87843.pdf": + filename = f"{dataset_name}-technical-report.pdf" + + self.logger.debug(f"Downloading {link} to {filename} for {zip_path}.") + await self.download_add_to_archive_and_unlink( + url=link, filename=filename, zip_path=zip_path + ) + data_paths_in_archive.add(filename) + + return ResourceInfo( + local_path=zip_path, + partitions={"data_set": dataset_name}, + layout=ZipLayout(file_paths=data_paths_in_archive), + ) From 796a9094438a200ce0380e580cfcd9b4b9aaf428 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Wed, 12 Feb 2025 18:32:01 -0500 Subject: [PATCH 03/11] Revert to logging all downloads to debug pdf --- src/pudl_archiver/archivers/nrel/siting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_archiver/archivers/nrel/siting.py b/src/pudl_archiver/archivers/nrel/siting.py index b9db63de..635ce797 100644 --- a/src/pudl_archiver/archivers/nrel/siting.py +++ b/src/pudl_archiver/archivers/nrel/siting.py @@ -107,7 +107,7 @@ async def get_siting_resources(self, dataset_dict: dict[str, str | int | list]): if filename == "87843.pdf": filename = f"{dataset_name}-technical-report.pdf" - self.logger.debug(f"Downloading {link} to {filename} for {zip_path}.") + self.logger.info(f"Downloading {link} to {filename} for {zip_path}.") await self.download_add_to_archive_and_unlink( url=link, filename=filename, zip_path=zip_path ) From fce903bf8d7e78db70cfea4bfd4020386bddb0f2 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 14 Feb 2025 12:33:46 -0500 Subject: [PATCH 04/11] Update docs, drop concurrency --- .../nrel/{siting.py => nrelsiting.py} | 69 +++++++++++++++---- 1 file changed, 55 insertions(+), 14 deletions(-) rename src/pudl_archiver/archivers/nrel/{siting.py => nrelsiting.py} (66%) diff --git a/src/pudl_archiver/archivers/nrel/siting.py b/src/pudl_archiver/archivers/nrel/nrelsiting.py similarity index 66% rename from src/pudl_archiver/archivers/nrel/siting.py rename to src/pudl_archiver/archivers/nrel/nrelsiting.py index 635ce797..9e497957 100644 --- a/src/pudl_archiver/archivers/nrel/siting.py +++ b/src/pudl_archiver/archivers/nrel/nrelsiting.py @@ -2,6 +2,9 @@ import re +from pydantic import BaseModel +from pydantic.alias_generators import to_camel + from pudl_archiver.archivers.classes import ( AbstractDatasetArchiver, ArchiveAwaitable, @@ -11,36 +14,74 @@ from pudl_archiver.frictionless import ZipLayout +class NrelAPIData(BaseModel): + """Data transfer object from NREL API.""" + + class Submission(BaseModel): + """Metadata about a specific dataset.""" + + submission_name: str + xdr_id: int + num_resources: int + file_count: int + status: str + # There are a few other fields that we don't parse here + # e.g., update date formatted in unix timestamps. We could + # revisit this in the future. + + class Config: # noqa: D106 + alias_generator = to_camel + populate_by_name = True + + result: bool + num_submissions: int + num_resources: int + num_files: int + size_of_files: int + stati: dict[str, int] + submissions: list[Submission] + + class Config: # noqa: D106 + alias_generator = to_camel + populate_by_name = True + + class NrelSitingArchiver(AbstractDatasetArchiver): """NREL Siting Lab Data archiver.""" name: str = "nrelsiting" base_url: str = "https://data.openei.org/siting_lab" + concurrency_limit = 2 # The server can get a bit cranky, so let's be nice. async def get_resources(self) -> ArchiveAwaitable: """Using data IDs, iterate and download all NREL Siting Lab files.""" - # The links on the table are hidden through Javascript. 
However, - # the IDs are exposed on this JS file, which links each dataset ID to an image. - # Rather than using Selenium, we can use this file to identify the links for all - # datasets hosted through the siting lab. + # The links on the table are hidden through Javascript. However, we can hit + # the API to get a dictionary containing metadata on each of the datasets + # associated with the Siting Lab. url = "https://data.openei.org/api" data = { - "action": "getSubmissionStatistics", + "action": "getSubmissionStatistics", # Get high-level data about the submissions "format": "json", - "s": "siting_lab", + "s": "siting_lab", # The name of the lab's data we want } response = await retry_async( self.session.post, args=[url], kwargs={"data": data} ) + # This returns a data dictionary containing metadata on + # the number of submissions, files, the ID (xdrId) of the dataset + # that corresponds to the Open EI link, the name, description and more. data_dict = await response.json() + data_dict = NrelAPIData(**data_dict) self.logger.info( - f"Downloading data for {data_dict['numSubmissions']} datasets. {data_dict['numFiles']} files ({data_dict['sizeOfFiles'] / 1e-9} GB)." + f"Downloading data for {data_dict.num_submissions} datasets. {data_dict.num_files} files ({data_dict.size_of_files / 1e-9} GB)." ) - for item in data_dict["submissions"]: - yield self.get_siting_resources(item) + for dataset in data_dict.submissions: + yield self.get_siting_resources(dataset=dataset) - async def download_nrel_data(self, dataset_id: str, dataset_link: str) -> set: + async def compile_nrel_download_links( + self, dataset_id: str, dataset_link: str + ) -> set: """For a given NREL dataset link, grab all PDFs and data links from the page.""" # There are many file types here, so we match using the more general link pattern # e.g., https://data.openei.org/files/6121/nexrad_4km.tif @@ -62,15 +103,15 @@ async def download_nrel_data(self, dataset_id: str, dataset_link: str) -> set: download_links.update(pdf_download_links) return download_links - async def get_siting_resources(self, dataset_dict: dict[str, str | int | list]): + async def get_siting_resources(self, dataset: NrelAPIData.Submission): """Download all files for a siting resource.""" - dataset_id = dataset_dict["xdrId"] + dataset_id = dataset.xdr_id dataset_link = f"https://data.openei.org/submissions/{dataset_id}" self.logger.info(f"Downloading files from {dataset_link}") # Create zipfile name from dataset name - title = dataset_dict["submissionName"] + title = dataset.submission_name dataset_name = title.lower().strip() dataset_name = re.sub( r"([^a-zA-Z0-9 ])", "", dataset_name @@ -81,7 +122,7 @@ async def get_siting_resources(self, dataset_dict: dict[str, str | int | list]): data_paths_in_archive = set() # First, get all the links from the page itself - data_links = await self.download_nrel_data( + data_links = await self.compile_nrel_download_links( dataset_id=dataset_id, dataset_link=dataset_link ) From 8894756ffc43de71e217d0877bd7bc83e9da7544 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Mon, 17 Feb 2025 09:49:05 -0500 Subject: [PATCH 05/11] Remove duplicated NREL EFS from merge --- src/pudl_archiver/archivers/nrelefs.py | 167 ------------------------- 1 file changed, 167 deletions(-) delete mode 100644 src/pudl_archiver/archivers/nrelefs.py diff --git a/src/pudl_archiver/archivers/nrelefs.py b/src/pudl_archiver/archivers/nrelefs.py deleted file mode 100644 index f9705b0e..00000000 --- a/src/pudl_archiver/archivers/nrelefs.py +++ /dev/null @@ -1,167 
+0,0 @@ -"""Download NREL Electrification Futures Study data.""" - -import re - -from pudl_archiver.archivers.classes import ( - AbstractDatasetArchiver, - ArchiveAwaitable, - ResourceInfo, -) - -# Main page -# https://www.nrel.gov/analysis/electrification-futures.html - -# Grab all data sites with the following formats -# https://data.nrel.gov/submissions/90 -# https://data.openei.org/submissions/4130 - -# Also grab all PDFs on the main page -BASE_URL = "https://www.nrel.gov/analysis/electrification-futures.html" - - -class NrelEFSArchiver(AbstractDatasetArchiver): - """NREL Electrification Futures Studies archiver.""" - - name = "nrelefs" - - async def get_resources(self) -> ArchiveAwaitable: - """Download NREL EFS resources. - - The main page links to a series of PDFs as well as data.nrel.gov and data.openei.org webpages - containing associated data for each report. - """ - data_link_pattern = re.compile( - r"https:\/\/data.nrel.gov\/submissions\/|https:\/\/data.openei.org\/submissions\/" - ) - # Regex for matching the two pages containing data for a study on the NREL EFS page. - - pdf_pattern = re.compile(r"\/docs\/fy(\d{2})osti\/\w*.pdf") - # Regex for matching a PDF on the page - - # From the main page, grab all the PDFs - pdf_links = await self.get_hyperlinks(BASE_URL, pdf_pattern) - for link, filename in pdf_links.items(): - # Flow through workflow to identify the version of the PDF, - # the final filename - - # Clean up file name - self.logger.info(f"Downloading {link}") - filename = ( - filename.lower() - .replace("\n", "") - .replace("electrification futures study:", "") - ) - filename = re.sub( - "[^a-zA-Z0-9 -]+", "", filename - ).strip() # Remove all non-word, digit space or - characters - filename = re.sub(r"\s+", "-", filename) # Replace 1+ space with a dash - - # Map technical reports to versions - technical_report_version_map = { - "operational-analysis-of-us-power-systems-with-increased-electrification-and-demand-side-flexibility": 6, - "scenarios-of-power-system-evolution-and-infrastructure-development-for-the-united-states": 5, - "methodological-approaches-for-assessing-long-term-power-system-impacts-of-end-use-electrificatio": 4, - "the-demand-side-grid-dsgrid-model-documentation": 3, - "scenarios-of-electric-technology-adoption-and-power-consumption-for-the-united-states": 2, - "end-use-electric-technology-cost-and-performance-projections-through-2050": 1, - } - - if filename in technical_report_version_map: - final_filename = f"nrelefs-{filename}.pdf" - partitions = { - "report_number": technical_report_version_map[filename], - "document_type": "technical_report", - } - - # Map "presentation slides" to version based on URL - elif filename == "presentation-slides": - link_to_version = { - "/docs/fy21osti/80167.pdf": 6, - "/docs/fy21osti/78783.pdf": 5, - "/docs/fy18osti/72096.pdf": 2, - } - - report_number = link_to_version[link] - final_filename = f"nrelefs-{str(report_number)}-{filename}.pdf" - partitions = { - "report_number": report_number, - "document_type": "presentation", - } - - # Handle 2 special cases - elif ( - filename - == "electrification-of-industry-summary-of-efs-industrial-sector-analysis" - ): - final_filename = f"nrelefs-{filename}.pdf" - partitions = { - "report_number": 2, - "document_type": "industrial_sector_presentation", - } - - elif filename == "the-demand-side-grid-dsgrid-model": - final_filename = f"nrelefs-{filename}.pdf" - partitions = {"report_number": 3, "document_type": "presentation"} - - # Ignore a few other PDF links on the page 
that aren't from the EFS - else: - self.logger.warn(f"Found {filename} at {link} but didn't download.") - continue - yield self.get_pdf_resource(final_filename, link, partitions) - - # For each data link found on the page, iterate through and download files - for link in await self.get_hyperlinks(BASE_URL, data_link_pattern): - yield self.get_version_resource(link=link) - - async def get_version_resource( - self, links: dict[str, str], year: int - ) -> ResourceInfo: - """Download all available data for a given page of EFS data. - - Resulting resource contains one zip file of CSVs per state/territory, plus a handful of .xlsx dictionary and geocoding files. - - Args: - links: filename->URL mapping for files to download - year: the year we're downloading data for - """ - # host = "https://data.openei.org" - # zip_path = self.download_directory / f"doelead-{year}.zip" - # data_paths_in_archive = set() - # for filename, link in sorted(links.items()): - # self.logger.info(f"Downloading {link}") - # download_path = self.download_directory / filename - # await self.download_file(f"{host}{link}", download_path) - # self.add_to_archive( - # zip_path=zip_path, - # filename=filename, - # blob=download_path.open("rb"), - # ) - # data_paths_in_archive.add(filename) - # # Don't want to leave multiple giant files on disk, so delete - # # immediately after they're safely stored in the ZIP - # download_path.unlink() - # return ResourceInfo( - # local_path=zip_path, - # partitions={"year": year}, - # layout=ZipLayout(file_paths=data_paths_in_archive), - # ) - - async def get_pdf_resource( - self, final_filename: str, link: str, partitions: dict[str, str | int] - ) -> ResourceInfo: - """Download PDF resource. - - Resulting resource contains one PDF file with information about the EFS dataset. - - Args: - link: filename->URL mapping for files to download - filename: the name of the file on the NREL EFS webpage - partitions: partitions for downloaded file - """ - download_path = self.download_directory / final_filename - full_link = f"https://www.nrel.gov/{link}" - await self.download_file(url=full_link, file_path=download_path) - return ResourceInfo( - local_path=download_path, - partitions=partitions, - ) From 8ae849d73748889f0c3f1a6ed3a62f1f96a897e2 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Mon, 17 Feb 2025 16:40:56 -0500 Subject: [PATCH 06/11] Update logs, drop concurrency, add timeouts --- src/pudl_archiver/archivers/nrel/nrelsiting.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/pudl_archiver/archivers/nrel/nrelsiting.py b/src/pudl_archiver/archivers/nrel/nrelsiting.py index 9e497957..618bc4e4 100644 --- a/src/pudl_archiver/archivers/nrel/nrelsiting.py +++ b/src/pudl_archiver/archivers/nrel/nrelsiting.py @@ -1,5 +1,6 @@ """Download data from the NREL siting lab data.""" +import asyncio import re from pydantic import BaseModel @@ -51,7 +52,7 @@ class NrelSitingArchiver(AbstractDatasetArchiver): name: str = "nrelsiting" base_url: str = "https://data.openei.org/siting_lab" - concurrency_limit = 2 # The server can get a bit cranky, so let's be nice. + concurrency_limit = 1 # The server can get a bit cranky, so let's be nice. async def get_resources(self) -> ArchiveAwaitable: """Using data IDs, iterate and download all NREL Siting Lab files.""" @@ -74,7 +75,7 @@ async def get_resources(self) -> ArchiveAwaitable: data_dict = NrelAPIData(**data_dict) self.logger.info( - f"Downloading data for {data_dict.num_submissions} datasets. 
{data_dict.num_files} files ({data_dict.size_of_files / 1e-9} GB)." + f"Downloading data for {data_dict.num_submissions} datasets. {data_dict.num_files} files ({data_dict.size_of_files * 1e-9} GB)." ) for dataset in data_dict.submissions: yield self.get_siting_resources(dataset=dataset) @@ -108,7 +109,6 @@ async def get_siting_resources(self, dataset: NrelAPIData.Submission): dataset_id = dataset.xdr_id dataset_link = f"https://data.openei.org/submissions/{dataset_id}" - self.logger.info(f"Downloading files from {dataset_link}") # Create zipfile name from dataset name title = dataset.submission_name @@ -140,7 +140,12 @@ async def get_siting_resources(self, dataset: NrelAPIData.Submission): data_links.update(additional_data_paths_in_archive) # For each link we've collected, download it and add it to the zipfile - for link in set(data_links): # Use set to handle duplication + data_links = set(data_links) # Use set to handle duplication + self.logger.info( + f"{dataset.submission_name}: Downloading {len(data_links)} files associated with {dataset_link}" + ) + + for link in data_links: filename = link.split("/")[-1] # This file shows up in multiple datasets, # causing collision when they run concurrently. Rename it @@ -153,6 +158,7 @@ async def get_siting_resources(self, dataset: NrelAPIData.Submission): url=link, filename=filename, zip_path=zip_path ) data_paths_in_archive.add(filename) + await asyncio.sleep(10) # Attempt to reduce server throttling return ResourceInfo( local_path=zip_path, From 15f8089e4792ec4aec6e1554430ad34798959c87 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Mon, 17 Feb 2025 18:38:46 -0500 Subject: [PATCH 07/11] Restore deleted metadata, fix additional downloads method --- .../archivers/nrel/nrelsiting.py | 2 +- src/pudl_archiver/metadata/sources.py | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/pudl_archiver/archivers/nrel/nrelsiting.py b/src/pudl_archiver/archivers/nrel/nrelsiting.py index 618bc4e4..a04b2c0d 100644 --- a/src/pudl_archiver/archivers/nrel/nrelsiting.py +++ b/src/pudl_archiver/archivers/nrel/nrelsiting.py @@ -134,7 +134,7 @@ async def get_siting_resources(self, dataset: NrelAPIData.Submission): # For each additional dataset linked, iterate through the same process for link in links: additional_dataset_id = link.split("/")[-1] - additional_data_paths_in_archive = await self.download_nrel_data( + additional_data_paths_in_archive = await self.compile_nrel_download_links( dataset_id=additional_dataset_id, dataset_link=link ) data_links.update(additional_data_paths_in_archive) diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index f8eff8da..4beb4449 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -416,6 +416,34 @@ "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], }, + "nrelsiting": { + "title": "NREL Renewable Energy Siting Lab Data", + "path": "https://data.openei.org/siting_lab", + "description": ( + "This repository contains all data produced by the NREL Renewable Energy Siting Lab. " + "The Siting Lab offers information on solar energy siting regulations and zoning ordinances, " + "as well as supply curve data. Documentation particular to each dataset can be found in the " + "relevant dataset zipfile." 
+ ), + "working_partitions": {}, + "keywords": sorted( + { + "nrel", + "siting", + "supply curves", + "pv", + "solar", + "wind", + "ordinances", + "setbacks", + "nexrad", + "moratoriums", + } + ), + "license_raw": LICENSES["us-govt"], + "license_pudl": LICENSES["cc-by-4.0"], + "contributors": [CONTRIBUTORS["catalyst-cooperative"]], + }, "nrelss": { "title": "NREL Standard Scenarios", "path": "https://www.nrel.gov/analysis/standard-scenarios.html", From 05776a25f81a15aacb6faa2973a359f47ff1954f Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 18 Feb 2025 11:38:24 -0500 Subject: [PATCH 08/11] Switch to using get_json --- src/pudl_archiver/archivers/nrel/nrelsiting.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/pudl_archiver/archivers/nrel/nrelsiting.py b/src/pudl_archiver/archivers/nrel/nrelsiting.py index a04b2c0d..f318e0b0 100644 --- a/src/pudl_archiver/archivers/nrel/nrelsiting.py +++ b/src/pudl_archiver/archivers/nrel/nrelsiting.py @@ -10,7 +10,6 @@ AbstractDatasetArchiver, ArchiveAwaitable, ResourceInfo, - retry_async, ) from pudl_archiver.frictionless import ZipLayout @@ -65,13 +64,10 @@ async def get_resources(self) -> ArchiveAwaitable: "format": "json", "s": "siting_lab", # The name of the lab's data we want } - response = await retry_async( - self.session.post, args=[url], kwargs={"data": data} - ) + data_dict = await self.get_json(url=url, post=True, data=data) # This returns a data dictionary containing metadata on # the number of submissions, files, the ID (xdrId) of the dataset # that corresponds to the Open EI link, the name, description and more. - data_dict = await response.json() data_dict = NrelAPIData(**data_dict) self.logger.info( From 7c9010fd5cbc86269567a8ac43353a573d3f9b83 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 18 Feb 2025 12:44:08 -0500 Subject: [PATCH 09/11] Add dataset descriptions as text files --- .../archivers/nrel/nrelsiting.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/pudl_archiver/archivers/nrel/nrelsiting.py b/src/pudl_archiver/archivers/nrel/nrelsiting.py index f318e0b0..33fe5bd1 100644 --- a/src/pudl_archiver/archivers/nrel/nrelsiting.py +++ b/src/pudl_archiver/archivers/nrel/nrelsiting.py @@ -2,6 +2,7 @@ import asyncio import re +from io import BytesIO from pydantic import BaseModel from pydantic.alias_generators import to_camel @@ -117,6 +118,24 @@ async def get_siting_resources(self, dataset: NrelAPIData.Submission): zip_path = self.download_directory / f"nrelsiting-{dataset_name}.zip" data_paths_in_archive = set() + # First, get dataset description from abstract class using bs4 + # Save to text file alongside other documentation + soup = await self.get_soup(dataset_link) + description = soup.select_one(".abstract") + # Add the link we archived from to the description. + description = ( + description.text + + f"\n\nThis data was archived from {dataset_link} by Catalyst Cooperative." 
+ ) + description_bytes = description.encode("utf-8") + filename = f"{dataset_name}-description.txt" + self.add_to_archive( + zip_path=zip_path, + filename=filename, + blob=BytesIO(description_bytes), + ) + data_paths_in_archive.add(filename) + # First, get all the links from the page itself data_links = await self.compile_nrel_download_links( dataset_id=dataset_id, dataset_link=dataset_link From 7bc7ffd4a4b8dad17367fdbda190f6d09a798e09 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 18 Feb 2025 12:49:43 -0500 Subject: [PATCH 10/11] Clean up docstrings --- src/pudl_archiver/archivers/nrel/nrelsiting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_archiver/archivers/nrel/nrelsiting.py b/src/pudl_archiver/archivers/nrel/nrelsiting.py index 33fe5bd1..77e50132 100644 --- a/src/pudl_archiver/archivers/nrel/nrelsiting.py +++ b/src/pudl_archiver/archivers/nrel/nrelsiting.py @@ -136,7 +136,7 @@ async def get_siting_resources(self, dataset: NrelAPIData.Submission): ) data_paths_in_archive.add(filename) - # First, get all the links from the page itself + # Next, get all the links from the page itself data_links = await self.compile_nrel_download_links( dataset_id=dataset_id, dataset_link=dataset_link ) From 5650991a35ac73da50b656394837ce35627ddee2 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Mon, 24 Feb 2025 11:05:46 +0100 Subject: [PATCH 11/11] Add DOIs and add to GHA workflow --- .github/workflows/run-archiver.yml | 4 ++-- src/pudl_archiver/package_data/zenodo_doi.yaml | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml index a2959631..978a2c39 100644 --- a/.github/workflows/run-archiver.yml +++ b/.github/workflows/run-archiver.yml @@ -6,7 +6,7 @@ on: inputs: datasets: description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").' - default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","nrelefs","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' + default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","nrelefs","nrelsiting","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' required: true type: string create_github_issue: @@ -26,7 +26,7 @@ jobs: strategy: matrix: # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here. 
- dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","nrelefs","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }} + dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","nrelefs","nrelsiting","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }} fail-fast: false runs-on: ubuntu-latest permissions: diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml index 4daaaee8..847b8778 100644 --- a/src/pudl_archiver/package_data/zenodo_doi.yaml +++ b/src/pudl_archiver/package_data/zenodo_doi.yaml @@ -95,7 +95,10 @@ nrelatb: sandbox_doi: 10.5072/zenodo.38192 nrelefs: production_doi: 10.5281/zenodo.14782873 - #sandbox_doi: TODO: Still having sandbox server problems, skipping for now. + sandbox_doi: 10.5072/zenodo.165948 +nrelsiting: + production_doi: 10.5281/zenodo.14888356 + #sandbox_doi: phmsagas: production_doi: 10.5281/zenodo.7683351 sandbox_doi: 10.5072/zenodo.45279
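
For reference, the dataset-discovery flow that nrel/nrelsiting.py introduces above reduces to a single POST against the OpenEI API. Below is a minimal standalone sketch of that request, assuming the synchronous requests library is available (the archiver itself goes through its aiohttp session and, after patch 08, its get_json helper); the response field names mirror the NrelAPIData model defined in patch 04.

    import requests

    API_URL = "https://data.openei.org/api"  # same endpoint the archiver posts to
    payload = {
        "action": "getSubmissionStatistics",  # high-level stats for a data lab
        "format": "json",
        "s": "siting_lab",  # the Siting Lab's submissions
    }

    resp = requests.post(API_URL, data=payload, timeout=60)
    resp.raise_for_status()
    stats = resp.json()

    # Fields the archiver relies on: numSubmissions, numFiles, sizeOfFiles (bytes),
    # and a submissions list whose xdrId maps to
    # https://data.openei.org/submissions/<xdrId>.
    print(
        f"{stats['numSubmissions']} datasets, {stats['numFiles']} files, "
        f"{stats['sizeOfFiles'] * 1e-9:.1f} GB"
    )
    for submission in stats["submissions"]:
        print(submission["xdrId"], submission["submissionName"])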
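
Per-dataset file discovery then scrapes each submission page for site-relative files/<id>/ data links plus the NREL-hosted PDF reports. A rough synchronous equivalent of compile_nrel_download_links, assuming requests and beautifulsoup4 (list_submission_files is just an illustrative name; the archiver uses its async get_hyperlinks helper instead):

    import re

    import requests
    from bs4 import BeautifulSoup


    def list_submission_files(xdr_id: int) -> set[str]:
        """Collect data-file and PDF links from one Siting Lab submission page."""
        page_url = f"https://data.openei.org/submissions/{xdr_id}"
        html = requests.get(page_url, timeout=60).text
        soup = BeautifulSoup(html, "html.parser")

        data_pattern = re.compile(rf"files/{xdr_id}/")  # e.g. /files/6121/nexrad_4km.tif
        pdf_pattern = re.compile(r"docs/[\w/]*\.pdf$")  # e.g. .../docs/fy24osti/87843.pdf

        links = set()
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if data_pattern.search(href):
                links.add(f"https://data.openei.org{href}")  # data links are site-relative
            elif pdf_pattern.search(href):
                links.add(href)  # PDF links already point at www.nrel.gov
        return links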