diff --git a/LICENSE b/LICENSE
index 261eeb9..86dbdc3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,8 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2023 California Institute of Technology (“Caltech”).
+   U.S. Government sponsorship acknowledged.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..0948fda
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,13 @@
+channels:
+- conda-forge
+dependencies:
+- python >= 3.8
+- geopandas
+- numpy
+- pandas
+- pip
+- shapely
+- tqdm
+- utm
+- pip:
+  - unzip_http
diff --git a/readme.md b/readme.md
index ed3f3fb..e1fa429 100644
--- a/readme.md
+++ b/readme.md
@@ -10,19 +10,20 @@ Follow the steps below to install `burst_db` using a conda environment.
 ```bash
 git clone https://github.com/opera-adt/burst_db burst_db
+cd burst_db
 ```
 
 2. Install dependencies:
 
 ```bash
-conda install -c conda-forge --file burst_db/requirements.txt
+conda env create --name burst_db --file environment.yml
 ```
 
 3. Install `burst_db` via pip:
 
 ```bash
 # run "pip install -e" to install in development mode
-python -m pip install ./burst_db
+python -m pip install .
 ```
 
 ## How to use
@@ -37,6 +38,66 @@
 
 - `sqlite_path_out` : Path to the output SQLite database file.
 
+## Frame database information
+
+After running `pip install .`, the `opera-create-db` command becomes available; running it creates the SQLite frame database, along with JSON files that map burst IDs to frame IDs and frame IDs to burst IDs.
+
+The format of the frame-to-burst mapping is
+```python
+{
+  "data" : {
+    "1": {
+      "epsg": 32631,
+      "is_land": False,
+      "is_north_america": False,
+      "xmin": 500160,
+      "ymin": 78240,
+      "xmax": 789960,
+      "ymax": 322740,
+      "burst_ids": [
+        "t001_000001_iw1",
+        "t001_000001_iw2",
+        "t001_000001_iw3",
+        "t001_000002_iw1",
+        ...
+        "t001_000009_iw3"
+      ]
+    }, ...
+  },
+  "metadata": {
+    "version": "0.1.2", "margin": 5000.0, ...
+  }
+}
+```
+where the keys of the `data` dict are the frame IDs.
+
+The burst-to-frame mapping has the structure
+```python
+{
+  "data" : {
+    "t001_000001_iw1": {"frame_ids": [1]},
+    "t001_000001_iw2": {"frame_ids": [1]},
+    ...
+  },
+  "metadata": {
+    "version": "0.1.2", "margin": 5000.0, ...
+  }
+}
+```
+These data structures can be read into Python using the function `build_frame_db.read_zipped_json`, as in the example below.
+
+The `opera-create-db` command also creates the full [GeoPackage database](https://www.geopackage.org/) (which is based on SQLite), in which the `burst_id_map` table contains the burst geometries, the `frames` table contains the frame geometries, and the `frames_bursts` table is the JOIN table for the many-to-many relationship.
+An example SQL query to view all columns of these tables is
+```sql
+SELECT *
+FROM frames f
+JOIN frames_bursts fb ON fb.frame_fid = f.fid
+JOIN burst_id_map b ON fb.burst_ogc_fid = b.ogc_fid
+LIMIT 1;
+```
+You can also drag the `opera-s1-disp.gpkg` file into QGIS to load the `frames` and `burst_id_map` tables and filter/view the geometries.
+
 ### License
 **Copyright (c) 2022** California Institute of Technology (“Caltech”). U.S. Government sponsorship acknowledged.
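The `read_zipped_json` helper mentioned in the readme can be used to load both mappings. A minimal sketch (assuming the package is installed and the default `opera-s1-disp` output names; the frame and burst keys shown are illustrative):

```python
from burst_db.build_frame_db import read_zipped_json

# Default output names produced by `opera-create-db`
# (adjust if a different --outfile was used).
frame_to_burst = read_zipped_json("opera-s1-disp-frame-to-burst.json.zip")
burst_to_frame = read_zipped_json("opera-s1-disp-burst-to-frame.json.zip")

# Frame IDs become string keys after the round trip through JSON.
print(frame_to_burst["data"]["1"]["burst_ids"][:3])
print(burst_to_frame["data"]["t001_000001_iw1"]["frame_ids"])
```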
diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 548c1e8..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -python>=3.9 # sentinel1-reader requirement -numpy # sentinel1-reader requirement -gdal>=3 -sqlite diff --git a/setup.py b/setup.py index 3227903..a180426 100644 --- a/setup.py +++ b/setup.py @@ -1,35 +1,38 @@ -''' +""" setup.py for OPERA burst database generator -''' +""" import os +import sys -from setuptools import setup +from setuptools import find_packages, setup -__version__ = VERSION = '0.1.0' +# taken from mintpy: https://github.com/insarlab/MintPy/blob/main/setup.py +# Grab version and description from version.py +# link: https://stackoverflow.com/questions/53648900 +sys.path.append(os.path.join(os.path.dirname(__file__), "src")) +from burst_db.version import release_version -LONG_DESCRIPTION = 'Burst database for OPERA SAS' - -package_data_dict = {} - -package_data_dict['rtc'] = [ - os.path.join('defaults', 'rtc_s1.yaml'), - os.path.join('schemas', 'rtc_s1.yaml')] +LONG_DESCRIPTION = "Sentinel-1 Burst database for OPERA SAS" setup( - name = 'burst_db', - version = VERSION, - description = 'Burst database for OPERA SAS', - package_dir = {'burst_db': 'src/burst_db'}, - include_package_data = True, - package_data = package_data_dict, - classifiers = ['Programming Language :: Python'], - #scripts = ['app/rtc_s1.py'], - install_requires = ['argparse', 'numpy', 'gdal'], - url = 'https://github.com/opera-adt/burst_db', - author = ('Seongsu Jeong'), - author_email = ('seongsu.jeong@jpl.nasa.gov'), - license = ('Copyright by the California Institute of Technology.' - ' ALL RIGHTS RESERVED.'), - long_description=LONG_DESCRIPTION + name="burst_db", + version=release_version, + description="Burst database for OPERA SAS", + packages=find_packages("src"), # include all packages under src + package_dir={"": "src"}, # tell distutils packages are under src + classifiers=["Programming Language :: Python"], + url="https://github.com/opera-adt/burst_db", + author="Seongsu Jeong; Scott J. Staniewicz", + author_email="seongsu.jeong@jpl.nasa.gov; scott.j.staniewicz@jpl.nasa.gov", + license=( + "Copyright by the California Institute of Technology. ALL RIGHTS RESERVED." 
+    ),
+    long_description=LONG_DESCRIPTION,
+    # Add console scripts here
+    entry_points={
+        "console_scripts": [
+            "opera-create-db = burst_db.build_frame_db:main",
+        ],
+    },
 )
diff --git a/src/burst_db/_esa_burst_db.py b/src/burst_db/_esa_burst_db.py
new file mode 100644
index 0000000..5bd0c05
--- /dev/null
+++ b/src/burst_db/_esa_burst_db.py
@@ -0,0 +1,29 @@
+'''
+An internal module to download the ESA burst database
+'''
+import os
+import shutil
+import subprocess
+import tempfile
+import zipfile
+
+ESA_DB_URL = "https://sar-mpc.eu/files/S1_burstid_20220530.zip"
+
+
+def get_esa_burst_db(output_path="burst_map_IW_000001_375887.sqlite3"):
+    """Download the ESA burst database."""
+    print(f"Downloading ESA burst database from {ESA_DB_URL} to {output_path}.")
+    db_filename = "S1_burstid_20220530/IW/sqlite/burst_map_IW_000001_375887.sqlite3"
+    cur_dir = os.getcwd()
+    output_path = os.path.abspath(output_path)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        try:
+            os.chdir(tmpdir)
+            subprocess.check_call(["wget", ESA_DB_URL])
+
+            with zipfile.ZipFile(ESA_DB_URL.split("/")[-1], "r") as zip_ref:
+                zip_ref.extract(db_filename)
+            shutil.move(db_filename, output_path)
+            shutil.rmtree(db_filename.split("/")[0])
+        finally:
+            os.chdir(cur_dir)
diff --git a/src/burst_db/_land_usgs.py b/src/burst_db/_land_usgs.py
new file mode 100644
index 0000000..77b20b2
--- /dev/null
+++ b/src/burst_db/_land_usgs.py
@@ -0,0 +1,108 @@
+"""An internal module to download shape files for land area and Greenland."""
+import fnmatch
+import zipfile
+from pathlib import Path
+
+import geopandas as gpd
+import pandas as pd
+import requests
+import unzip_http
+from shapely.geometry import MultiPolygon
+
+USGS_LAND_URL = "https://www.ngdc.noaa.gov/mgg/shorelines/data/gshhg/latest/gshhg-shp-2.3.7.zip"  # noqa
+GREENLAND_URL = "https://stacks.stanford.edu/file/druid:sd368wz2435/data.zip"  # noqa
+
+
+def get_usgs_land(outpath=None):
+    """Get the USGS land data from the following URL:
+
+    https://www.ngdc.noaa.gov/mgg/shorelines/data/gshhg/latest/gshhg-shp-2.3.7.zip
+    """
+    outpath = Path(outpath) if outpath else Path.cwd()
+    rzf = unzip_http.RemoteZipFile(USGS_LAND_URL)
+    # Level 1: Continental land masses and ocean islands, except Antarctica.
+    # Level 6: Antarctica based on grounding line boundary.
+ paths = ["GSHHS_shp/h/GSHHS_h_L1.*", "GSHHS_shp/h/GSHHS_h_L6.*"] + shp_files = [] + dfs = [] + for fn in rzf.infolist(): + if not any(fnmatch.fnmatch(fn.filename, g) for g in paths): + continue + outname = outpath / fn.filename + if outname.suffix == (".shp"): + shp_files.append(outname) + if not outname.exists(): + outname.parent.mkdir(parents=True, exist_ok=True) + with rzf.open(fn) as fp, open(outname, "wb") as fout: + print(f"Extracting {fn.filename} to {outname}") + while r := fp.read(2**18): + fout.write(r) + for p in shp_files: + dfs.append(gpd.read_file(p)) + return dfs + + +def get_land_df( + buffer_deg=0.2, + outname="usgs_land_{d}deg_buffered.geojson", + driver="GeoJSON", + do_zip=True, +) -> gpd.GeoDataFrame: + """Create a GeoDataFrame of the (buffered) USGS land polygons.""" + outname = outname.format(d=buffer_deg) + if outname and Path(outname).exists(): + print(f"Loading {outname} from disk") + return gpd.read_file(outname) + elif Path(outname + ".zip").exists(): + print(f"Loading {outname}.zip from disk") + return gpd.read_file(str(outname) + ".zip") + + # If we haven't already made the file, make it + df_land_cont, df_antarctica = get_usgs_land() + df_land = pd.concat([df_land_cont, df_antarctica], axis=0)[["geometry"]] + df_land.geometry = df_land.geometry.buffer(buffer_deg) + df_land = df_land.dissolve() + + df_land.to_file(outname, driver=driver) + if do_zip and outname.endswith(".geojson"): + outname_zipped = Path(str(outname) + ".zip") + # zip and remove the original + with zipfile.ZipFile( + outname_zipped, "w", compression=zipfile.ZIP_DEFLATED + ) as zf: + zf.write(outname) + + # Remove the original + Path(outname).unlink() + + return df_land + + +def get_greenland_shape(outpath=None, buffer_deg=0.2) -> MultiPolygon: + """Get the Greenland data from the following URL: + + https://stacks.stanford.edu/file/druid:sd368wz2435/data.zip + """ + outpath = Path(outpath) if outpath else Path.cwd() + outname = outpath / f"greenland_{buffer_deg}deg_buffered.geojson" + if outname.exists(): + print(f"Loading {outname} from disk") + return gpd.read_file(outname).iloc[0].geometry + + # download the whole greenland shapefile + print("Downloading Greenland shapefile...") + r = requests.get(GREENLAND_URL) + zipfile = outpath / "greenland.zip" + if not zipfile.exists(): + with open(zipfile, "wb") as fout: + fout.write(r.content) + + df = gpd.read_file(zipfile) + print("Simplifying and buffering Greenland shapefile...") + g = df.iloc[0].geometry + # Now simplify first, then buffer + gs = g.simplify(0.1) + g_buffered = gs.buffer(buffer_deg) + # Save for later + gpd.GeoDataFrame(geometry=[g_buffered]).to_file(outname, driver="GeoJSON") + return g_buffered diff --git a/src/burst_db/_opera_north_america.py b/src/burst_db/_opera_north_america.py new file mode 100644 index 0000000..ce63420 --- /dev/null +++ b/src/burst_db/_opera_north_america.py @@ -0,0 +1,17 @@ +"""Module to read the OPERA North America shape. 
+
+Data comes from:
+https://github.com/OPERA-Cal-Val/DSWx-Validation-Experiments/blob/7f06ab98cf43135eb63e5a29593235dbebcb19fa/marshak/north_america_boundaries/north_america_opera.geojson
+"""
+from pathlib import Path
+
+import geopandas as gpd
+from shapely import GeometryType
+
+
+def get_opera_na_shape() -> GeometryType.MULTIPOLYGON:
+    """Read the OPERA North America geometry as a shapely `multipolygon`."""
+    filename = Path(__file__).parent / "data" / "north_america_opera.geojson.zip"
+    na_gpd = gpd.read_file(filename)
+    # Combine all geometries in the GeoDataFrame into one MultiPolygon
+    return na_gpd.geometry.unary_union
diff --git a/src/burst_db/build_frame_db.py b/src/burst_db/build_frame_db.py
new file mode 100755
index 0000000..cb03b29
--- /dev/null
+++ b/src/burst_db/build_frame_db.py
@@ -0,0 +1,754 @@
+#!/usr/bin/env python
+import argparse
+import datetime
+import json
+import sqlite3
+import time
+import zipfile
+from pathlib import Path
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import utm  # https://github.com/Turbo87/utm
+from shapely import GeometryType, STRtree
+from shapely.affinity import translate
+from tqdm.auto import tqdm
+
+from burst_db import __version__
+
+from . import frames
+from ._esa_burst_db import ESA_DB_URL, get_esa_burst_db
+from ._land_usgs import GREENLAND_URL, USGS_LAND_URL, get_greenland_shape, get_land_df
+from ._opera_north_america import get_opera_na_shape
+
+# Threshold to use EPSG:3413, Sea Ice Polar North (https://epsg.io/3413)
+NORTH_THRESHOLD = 75
+NORTH_EPSG = 3413
+# Threshold to use EPSG:3031, Antarctic Polar Stereographic (https://epsg.io/3031)
+SOUTH_THRESHOLD = -60
+SOUTH_EPSG = 3031
+
+
+def make_jpl_burst_id(df: pd.DataFrame):
+    """Make the JPL burst ID from the ESA burst ID."""
+    burst_id_jpl = (
+        "t"
+        + df["relative_orbit_number"].astype(str).str.zfill(3)
+        + "_"
+        + df["burst_id"].astype(str).str.zfill(6)
+        + "_"
+        + df["subswath_name"].str.lower()
+    )
+    return burst_id_jpl
+
+
+def _setup_spatialite_con(con: sqlite3.Connection):
+    """Enable spatialite and load the spatialite extension."""
+    con.enable_load_extension(True)
+    # Try the two versions, mac and linux with .so
+    try:
+        con.load_extension("mod_spatialite")
+    except sqlite3.OperationalError:
+        con.load_extension("mod_spatialite.so")
+    # Allow us to use spatialite functions on GPKG files.
+ # https://medium.com/@joelmalone/sqlite3-spatialite-and-geopackages-66a08485da6c + con.execute("SELECT EnableGpkgAmphibiousMode();") + + +def make_burst_triplets(df_burst: pd.DataFrame) -> pd.DataFrame: + """Make a burst triplets dataframe, aggregating IW1,2,3 from the burst dataframe.""" + + def join_track_numbers(orbits: list) -> str: + orbits = list(set(orbits)) + orbits_str = list(map(str, orbits)) + return ",".join(orbits_str) + + df_burst_triplet_temp = df_burst.dissolve( + by="burst_id", + aggfunc={ + "OGC_FID": ["min", "max"], + "relative_orbit_number": join_track_numbers, + "orbit_pass": "first", + }, + as_index=False, + ) + df_burst_triplet = df_burst_triplet_temp.reset_index(drop=True) + df_burst_triplet.columns = [ + "burst_id", + "geom", + "OGC_FID_min", + "OGC_FID_max", + "relative_orbit_numbers", + "look_direction", + ] + return df_burst_triplet + + +def get_land_indicator(gdf: gpd.GeoDataFrame, land_geom: GeometryType.POLYGON): + """Get a boolean array indicating if each row of `gdf` intersects `land_geom`.""" + tree = STRtree(gdf.geometry) + idxs_land = tree.query(land_geom, predicate="intersects") + if idxs_land.ndim == 2: + idxs_land = idxs_land[1] + is_in_land = gdf.index.isin(idxs_land) + return is_in_land + + +def make_frame_to_burst_table(outfile: str, df_frame_to_burst_id: pd.DataFrame): + """Create the frames_bursts table and indexes.""" + with sqlite3.connect(outfile) as con: + _setup_spatialite_con(con) + + df_frame_to_burst_id.to_sql("frames_bursts", con, if_exists="replace") + con.execute( + "CREATE INDEX IF NOT EXISTS idx_frames_bursts_burst_ogc_fid ON" + " frames_bursts (burst_ogc_fid)" + ) + con.execute( + "CREATE INDEX IF NOT EXISTS idx_frames_bursts_frame_fid ON frames_bursts" + " (frame_fid)" + ) + con.execute( + "CREATE INDEX IF NOT EXISTS idx_burst_id_map_burst_id ON burst_id_map" + " (burst_id)" + ) + + +def make_frame_table(outfile: str): + """Create the frames table and indexes.""" + with sqlite3.connect(outfile) as con: + _setup_spatialite_con(con) + con.execute( + "CREATE TABLE frames (fid INTEGER PRIMARY KEY, epsg INTEGER, " + "is_land INTEGER, is_north_america INTEGER)" + ) + # https://groups.google.com/g/spatialite-users/c/XcWvAk7vg0c + # should add geom after the table is created + # table_name , geometry_column_name , geometry_type , with_z , with_m , srs_id + con.execute( + "SELECT gpkgAddGeometryColumn('frames', 'geom', 'MULTIPOLYGON', 2, 0," + " 4326);" + ) + + # Aggregates burst geometries for each frame into one + con.execute( + """INSERT INTO frames(fid, is_land, geom) + SELECT fb.frame_fid as fid, + fb.is_land, + ST_UnaryUnion(ST_Collect(geom)) as geom + FROM burst_id_map b + JOIN + frames_bursts fb + ON b.ogc_fid = fb.burst_ogc_fid + GROUP BY 1; + """ + ) + print("Creating indexes and spatial index...") + con.execute("CREATE INDEX IF NOT EXISTS idx_frames_fid ON frames (fid)") + con.execute("SELECT gpkgAddSpatialIndex('frames', 'geom') ;") + # Extra thing so that QGIS recognizes "frames" better + con.execute( + "UPDATE gpkg_geometry_columns SET geometry_type_name = 'MULTIPOLYGON';" + ) + # Set the relative_orbit_number as the most common value for each frame + con.execute("ALTER TABLE frames ADD COLUMN relative_orbit_number INTEGER;") + con.execute( + """WITH frame_tracks AS ( + SELECT f.fid, + CAST(ROUND(AVG(b.relative_orbit_number)) AS INTEGER) AS relative_orbit_number + FROM frames f + JOIN frames_bursts fb ON f.fid = fb.frame_fid + JOIN burst_id_map b ON fb.burst_ogc_fid = b.ogc_fid + GROUP BY 1 + ) UPDATE frames SET 
relative_orbit_number = frame_tracks.relative_orbit_number + FROM frame_tracks + WHERE frames.fid = frame_tracks.fid; + """ + ) + # Set the orbit_pass as the value from the first burst + con.execute("ALTER TABLE frames ADD COLUMN orbit_pass TEXT;") + con.execute( + """WITH op AS + (SELECT f.fid, + b.orbit_pass + FROM frames f + JOIN frames_bursts fb ON f.fid = fb.frame_fid + JOIN burst_id_map b ON fb.burst_ogc_fid = b.ogc_fid), + frame_orbits AS ( + SELECT fid, + FIRST_VALUE(orbit_pass) OVER (PARTITION BY fid) AS orbit_pass + FROM op + GROUP BY fid) + UPDATE frames SET orbit_pass = frame_orbits.orbit_pass + FROM frame_orbits + WHERE frames.fid = frame_orbits.fid; + """ + ) + + # Drop the is_land from the frames_bursts table + con.execute("ALTER TABLE frames_bursts DROP COLUMN is_land;") + + +def get_epsg_codes(df: gpd.GeoDataFrame): + """Get the EPSG codes for all non-antimeridian polygons in a GeoDataFrame. + + Uses the UTM library to account for the oddities of the Zones near Norway [1]_. + + References + ---------- + .. _[1]: http://www.jaworski.ca/utmzones.htm + """ + + def _is_on_antimeridian(geom): + return geom.geom_type == "MultiPolygon" and len(geom.geoms) > 1 + + epsgs = np.zeros(len(df), dtype=int) + + # do the antimeridian frames first + am_idxs = df.geometry.map(_is_on_antimeridian).values + epsgs[am_idxs] = df[am_idxs].geometry.map(antimeridian_epsg) + + # everything else + # get the x, y (lon, lat) coords of all other rows + other_coords = np.array( + df[~am_idxs].geometry.map(lambda g: tuple(g.centroid.coords)[0]).tolist() + ) + xs, ys = other_coords.T + ys_full_size = np.ones(len(epsgs)) * np.nan + ys_full_size[~am_idxs] = ys + + idxs = np.logical_and.reduce((~am_idxs, ys_full_size > NORTH_THRESHOLD)) + epsgs[idxs] = NORTH_EPSG + + idxs = np.logical_and.reduce((~am_idxs, ys_full_size < SOUTH_THRESHOLD)) + epsgs[idxs] = SOUTH_EPSG + + utm_idxs = np.logical_and(ys < NORTH_THRESHOLD, ys > SOUTH_THRESHOLD) + north_idxs = ys[utm_idxs] > 0 + + # North hemisphere + zones_north = [ + utm.from_latlon(y, x)[2] + for (y, x) in zip(ys[utm_idxs][north_idxs], xs[utm_idxs][north_idxs]) + ] + idxs = np.logical_and.reduce( + (~am_idxs, ys_full_size < NORTH_THRESHOLD, ys_full_size > 0) + ) + epsgs[idxs] = 32600 + np.array(zones_north) + + # South hemisphere + zones_south = [ + utm.from_latlon(y, x)[2] + for (y, x) in zip(ys[utm_idxs][~north_idxs], xs[utm_idxs][~north_idxs]) + ] + idxs = np.logical_and.reduce( + (~am_idxs, ys_full_size > SOUTH_THRESHOLD, ys_full_size < 0) + ) + epsgs[idxs] = 32700 + np.array(zones_south) + + # Set all Greenland frames to EPSG:3413 + geom_greenland = get_greenland_shape() + is_in_greenland = get_land_indicator(df, geom_greenland) + print( + f"{is_in_greenland.sum()} frames are in Greenland. Setting to EPSG:{NORTH_EPSG}" + ) + epsgs[is_in_greenland] = NORTH_EPSG + + return epsgs + + +def antimeridian_epsg(mp): + """Calculate the EPSG of multipolygons along the antimeridian. + + Parameters + ---------- + mp : shapely.geometry.MultiPolygon + The multipolygon to calculate the EPSG for. + + Returns + ------- + epsg : int + The EPSG code for the multipolygon. + + Notes + ----- + + The EPSG code is calculated by taking the weighted average of the centroid of the + polygons in the multipolygon. The centroid is weighted by the area of the polygon. + The centroid is shifted by 360 degrees if it is in the western hemisphere. 
+ """ + y_c = mp.centroid.y + # check north/south pole cases + if y_c >= NORTH_THRESHOLD: + return NORTH_EPSG + elif y_c <= SOUTH_THRESHOLD: + return SOUTH_EPSG + + # otherwise, do the weighted average of the shifted polygons to get the centroid + A = 0 + x_weighted = 0 + # might have 2 or 3 polygons + for g in mp.geoms: + A += g.area + if g.centroid.x < 0: + g_shifted = translate(g, xoff=360) + x_weighted += g_shifted.centroid.x * g.area + else: + x_weighted += g.centroid.x * g.area + x_c = x_weighted / A + + # Northern hemisphere = 326XX, southern is 327XX + base = 32600 if y_c > 0 else 32700 + # EPSG increases negative to positive (west to east) + # 32601 is at longitude -179 (which 181 after shifting +360) + zone_addition = 1 if x_c > 180 else 60 + return base + zone_addition + + +def update_burst_epsg(outfile): + """Update the EPSG of each burst to match the EPSG of the frame it is in.""" + with sqlite3.connect(outfile) as con: + _setup_spatialite_con(con) + # add index + con.execute( + "CREATE INDEX IF NOT EXISTS idx_burst_id_map_burst_id_jpl ON burst_id_map" + " (burst_id_jpl)" + ) + # Set the EPSG on every burst from the frames + print("Updating burst EPSGs to match frames...") + sql = """WITH burst_epsgs AS ( + SELECT b.OGC_FID, + f.epsg + FROM burst_id_map b + JOIN frames_bursts fb + ON b.OGC_FID = fb.burst_ogc_fid + JOIN frames f + ON fb.frame_fid = f.fid + ) + UPDATE burst_id_map + SET epsg = burst_epsgs.epsg + FROM burst_epsgs + WHERE burst_id_map.OGC_FID = burst_epsgs.OGC_FID; + """ + con.execute(sql) + + +def add_gpkg_spatial_ref_sys(outfile): + """Add all EPSG codes to the gpkg_spatial_ref_sys table.""" + north_hemi_utm = list(range(32601, 32661)) + south_hemi_utm = list(range(32701, 32761)) + epsgs = [SOUTH_EPSG, NORTH_EPSG, 4326] + north_hemi_utm + south_hemi_utm + with sqlite3.connect(outfile) as con: + _setup_spatialite_con(con) + sql = "SELECT gpkgInsertEpsgSRID({epsg});" + for epsg in tqdm(epsgs): + try: + con.execute(sql.format(epsg=epsg)) + except (sqlite3.OperationalError, sqlite3.IntegrityError): + # exists + pass + # Fix the gpkg_spatial_ref_sys table for missing UTM zone 32760 + # https://www.gaia-gis.it/fossil/libspatialite/tktview/8b6910dbbb2180026af54a5cc5aac107fb1d62ad?plaintext + sql = """INSERT INTO gpkg_spatial_ref_sys ( + srs_name, + srs_id, + organization, + organization_coordsys_id, + definition + ) + VALUES ( + 'WGS 84 / UTM zone 60S', + 32760, + 'EPSG', + 32760, +'PROJCS["WGS 84 / UTM zone 60S",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",177],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",10000000],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","32760"]]' + ); + """ + try: + con.execute(sql) + except (sqlite3.OperationalError, sqlite3.IntegrityError): + # exists + pass + # More the entries from gpkg_spatial_ref_sys to spatial_ref_sys + # so we can use the `ST_Transform` function + con.execute("DROP TABLE IF EXISTS spatial_ref_sys;") + sql = """ + CREATE TABLE spatial_ref_sys ( + srid INTEGER NOT NULL PRIMARY KEY, + auth_name VARCHAR(256), + auth_srid INTEGER, + srtext VARCHAR(2048), + proj4text VARCHAR(2048) + );""" + con.execute(sql) + sql = """ 
+            INSERT INTO spatial_ref_sys
+            SELECT
+                srs_id AS srid,
+                organization AS auth_name,
+                organization_coordsys_id AS auth_srid,
+                definition AS srtext,
+                NULL
+            FROM gpkg_spatial_ref_sys;
+        """
+        con.execute(sql)
+
+
+def save_utm_bounding_boxes(outfile, *, margin: float, snap: float):
+    """Save the bounding boxes of each burst in UTM coordinates."""
+    print("Saving UTM bounding boxes...")
+    try:
+        with sqlite3.connect(outfile) as con:
+            for col in ["xmin", "ymin", "xmax", "ymax"]:
+                con.execute(f"ALTER TABLE burst_id_map ADD COLUMN {col} INTEGER;")
+    except sqlite3.OperationalError:
+        # Already exists
+        pass
+
+    sql = f"""
+WITH transformed(g, OGC_FID) AS
+  (SELECT ST_Envelope(ST_Transform(geom, epsg)) g,
+          OGC_FID
+   FROM burst_id_map
+   WHERE epsg != 0 )
+UPDATE burst_id_map
+SET (xmin,
+     ymin,
+     xmax,
+     ymax) = (bboxes.xmin,
+              bboxes.ymin,
+              bboxes.xmax,
+              bboxes.ymax)
+FROM
+  (SELECT OGC_FID,
+          FLOOR((ST_MinX(g) - {margin}) / {snap:.1f}) * {snap:.1f} AS xmin,
+          FLOOR((ST_MinY(g) - {margin}) / {snap:.1f}) * {snap:.1f} AS ymin,
+          CEIL((ST_MaxX(g) + {margin}) / {snap:.1f}) * {snap:.1f} AS xmax,
+          CEIL((ST_MaxY(g) + {margin}) / {snap:.1f}) * {snap:.1f} AS ymax
+   FROM transformed) AS bboxes
+WHERE burst_id_map.OGC_FID = bboxes.OGC_FID ;
+    """
+    with sqlite3.connect(outfile) as con:
+        _setup_spatialite_con(con)
+        con.execute(sql)
+
+
+def make_minimal_db(db_path, df_frame_to_burst_id, output_path):
+    """Make a minimal database with only the following columns:
+
+    burst_id_jpl, epsg, xmin, ymin, xmax, ymax, frame_ids
+    and a zip file with:
+    {burst_id_jpl: (epsg, xmin, ymin, xmax, ymax, frame_ids),...}
+    """
+    with sqlite3.connect(db_path) as con:
+        df = pd.read_sql_query(
+            (
+                "SELECT OGC_FID, burst_id_jpl, epsg, xmin, ymin, xmax, ymax FROM"
+                " burst_id_map"
+            ),
+            con,
+        )
+    # Make sure the snapped coordinates are stored as integers (~40% smaller than REAL)
+    df["xmin"] = df["xmin"].astype(int)
+    df["ymin"] = df["ymin"].astype(int)
+    df["xmax"] = df["xmax"].astype(int)
+    df["ymax"] = df["ymax"].astype(int)
+
+    with sqlite3.connect(output_path) as con:
+        df.to_sql("burst_id_map", con, if_exists="replace", index=False)
+
+    # Make a version which has the list of frames each burst belongs to
+    df_burst_to_frames = _get_burst_to_frame_list(df_frame_to_burst_id)
+    df_out = pd.merge(
+        df, df_burst_to_frames, how="inner", left_on="OGC_FID", right_on="burst_ogc_fid"
+    )
+    df_out.drop(columns="OGC_FID", inplace=True)
+    return df_out
+
+
+def make_burst_to_frame_json(df, output_path: str, metadata: dict):
+    data_dict = (
+        df.set_index("burst_id_jpl")[["frame_fid"]]
+        .rename(columns={"frame_fid": "frame_ids"})
+        .to_dict(orient="index")
+    )
+    # Format: {'t001_000001_iw1': {'frame_ids': [1]}, ...}
+    dict_out = {"data": data_dict, "metadata": metadata}
+    _write_zipped_json(output_path, dict_out)
+
+
+def make_frame_to_burst_json(db_path: str, output_path: str, metadata: dict):
+    with sqlite3.connect(db_path) as con:
+        df_frame_to_burst = pd.read_sql_query(
+            """
+            SELECT
+                f.fid AS frame_id,
+                f.epsg,
+                f.is_land,
+                f.is_north_america,
+                MIN(xmin) AS xmin,
+                MIN(ymin) AS ymin,
+                MAX(xmax) AS xmax,
+                MAX(ymax) AS ymax,
+                GROUP_CONCAT(burst_id_jpl) AS burst_ids
+            FROM frames f
+            JOIN frames_bursts fb ON fb.frame_fid = f.fid
+            JOIN burst_id_map b ON fb.burst_ogc_fid = b.ogc_fid
+            GROUP BY 1;
+            """,
+            con,
+        )
+    df_frame_to_burst.burst_ids = df_frame_to_burst.burst_ids.str.split(",")
+    df_frame_to_burst.is_land = df_frame_to_burst.is_land.astype(bool)
+    df_frame_to_burst.is_north_america = df_frame_to_burst.is_north_america.astype(bool)
+    data_dict = df_frame_to_burst.set_index("frame_id").to_dict(orient="index")
+    dict_out = {"data": data_dict, "metadata": metadata}
+
+    _write_zipped_json(output_path, dict_out)
+
+
+def _write_zipped_json(json_path, dict_out, level: int = 6):
+    json_zip_path = str(json_path) + ".zip"
+    with zipfile.ZipFile(
+        json_zip_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=level
+    ) as zf:
+        zf.writestr(json_path, json.dumps(dict_out))
+
+
+def _get_burst_to_frame_list(df_frame_to_burst_id):
+    """Get a DataFrame which maps the burst ID to the list of frames
+    containing the burst.
+
+    Example:
+                   frame_fid
+    burst_ogc_fid
+    ...
+    24                   [1]
+    25                [1, 2]
+    26                [1, 2]
+    27                [1, 2]
+    28                   [2]
+    """
+    return (
+        df_frame_to_burst_id[["burst_ogc_fid", "frame_fid"]]
+        .groupby("burst_ogc_fid")
+        .agg(list)
+    )
+
+
+def _get_metadata(args):
+    base = {
+        "margin": args.margin,
+        "snap": args.snap,
+        "target_frame_size": args.target_frame,
+        "version": __version__,
+        "last_modified": datetime.datetime.now().isoformat(),
+        "land_buffer_deg": args.land_buffer_deg,
+        "land_optimized": args.optimize_land,
+        "usgs_land_shape_url": USGS_LAND_URL,
+        "greenland_shape_url": GREENLAND_URL,
+        "esa_burst_db_url": ESA_DB_URL,
+    }
+    if args.optimize_land:
+        base["min_frame_size"] = args.min_frame
+        base["max_frame_size"] = args.max_frame
+    return base
+
+
+def create_metadata_table(db_path, metadata):
+    """Make the metadata table with the arguments used to create the database."""
+    df = pd.DataFrame([metadata])
+    with sqlite3.connect(db_path) as con:
+        df.to_sql("metadata", con, if_exists="replace", index=False)
+
+
+def read_zipped_json(filename):
+    with zipfile.ZipFile(filename) as zf:
+        bytes = zf.read(str(Path(filename).name).replace(".zip", ""))
+        return json.loads(bytes.decode())
+
+
+def get_cli_args():
+    parser = argparse.ArgumentParser(
+        description="Generate frames for Sentinel-1 data",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--esa-db-path",
+        default="burst_map_IW_000001_375887.sqlite3",
+        help=(
+            "Path to the ESA sqlite burst database to convert, "
+            f"downloaded from {ESA_DB_URL} . Will be downloaded if it does not exist."
+ ), + ) + parser.add_argument( + "--snap", + type=float, + default=30.0, + help="Snap the bounding box to the nearest multiple of this value.", + ) + parser.add_argument( + "--margin", + type=float, + default=5000.0, + help="Add this margin surrounding the bounding box of bursts.", + ) + parser.add_argument( + "--optimize-land", + action="store_true", + help=( + "Create frames which attempt to minimize the number of majority-water" + " frames" + ), + ) + parser.add_argument( + "--target-frame", + type=int, + default=9, + help="Target number of bursts per frame.", + ) + parser.add_argument( + "--min-frame", + type=int, + default=5, + help="(If using `--optimize-land`): Minimum number of bursts per frame", + ) + parser.add_argument( + "--max-frame", + type=int, + default=10, + help="(If using `--optimize-land`): Maximum number of bursts per frame", + ) + parser.add_argument( + "-o", + "--outfile", + default="opera-s1-disp.gpkg", + help="Output file name", + ) + parser.add_argument( + "--land-buffer-deg", + type=float, + default=0.3, + help="A buffer (in degrees) to indicate that a frame is near land.", + ) + + return parser.parse_args() + + +def main(): + args = get_cli_args() + + t0 = time.time() + + # Read ESA's Burst Data + if not Path(args.esa_db_path).exists(): + print(f"Downloading {ESA_DB_URL} to {args.esa_db_path}...") + get_esa_burst_db(args.esa_db_path) + + print("Loading burst data...") + sql = "SELECT * FROM burst_id_map" + with sqlite3.connect(args.esa_db_path) as con: + df_burst = gpd.GeoDataFrame.from_postgis( + sql, con, geom_col="GEOMETRY", crs="EPSG:4326" + ).rename_geometry("geom") + + print("Forming string JPL id") + jpl_ids = make_jpl_burst_id(df_burst) + df_burst.loc[:, "burst_id_jpl"] = jpl_ids + # placeholder to compute later + df_burst.loc[:, "epsg"] = 0 + + # Start the outfile with the ESA database contents + outfile = args.outfile + print("Saving initial version of `burst_id_map` table") + df_burst.set_index("OGC_FID").to_file( + outfile, driver="GPKG", layer="burst_id_map", index=False + ) + # Adjust the primary key so it still matches original OGC_FID + print("Renaming index column from 'fid' to 'OGC_FID'") + with sqlite3.connect(outfile) as con: + con.execute("ALTER TABLE burst_id_map RENAME COLUMN fid TO OGC_FID;") + + print("Aggregating burst triplets (grouping IW1,2,3 geometries together)") + df_burst_triplet = make_burst_triplets(df_burst) + + # Get the land polygon to intersect + print("Indicating which burst triplets are near land...") + df_land = get_land_df(args.land_buffer_deg) + land_geom = df_land.geometry + + is_in_land = get_land_indicator(df_burst_triplet, land_geom) + + # Create frames and JOIN tables + # Make the JOIN table first + print("Defining frames - bursts JOIN table") + df_frame_to_burst_id = frames.create_frame_to_burst_mapping( + is_in_land, + target_frame=args.target_frame, + min_frame=args.min_frame, + max_frame=args.max_frame, + optimize_land=args.optimize_land, + ) + make_frame_to_burst_table(outfile, df_frame_to_burst_id) + + # make the "frames" table + print("Making frames table by aggregating burst geometries...") + make_frame_table(outfile) + df_frames = gpd.read_file(outfile, layer="frames") + + print("Computing EPSG codes for each frame...") + epsgs = get_epsg_codes(df_frames) + df_frames.loc[:, "epsg"] = epsgs + + # Mark the ones in north america in the OPERA region of interest + geom_north_america = get_opera_na_shape() + is_in_north_america = get_land_indicator(df_frames, geom_north_america) + df_frames.loc[:, 
"is_north_america"] = is_in_north_america + + print("Final number of frames:", len(df_frames)) + print("Number intersecting land:", is_in_land.sum()) + print("Number in North America:", is_in_north_america.sum()) + print("Saving frames...") + df_frames.to_file(outfile, driver="GPKG", layer="frames") + + update_burst_epsg(outfile) + + # Create the bounding box in UTM coordinates + add_gpkg_spatial_ref_sys(outfile) + save_utm_bounding_boxes(outfile, margin=args.margin, snap=args.snap) + + # Make the minimal version of the DB + ext = Path(outfile).suffix + out_minimal = outfile.replace(ext, f"-bbox-only{ext}") + out_burst_to_frame = outfile.replace(ext, "-burst-to-frame.json") + print(f"Creating a epsg/bbox only version: {out_minimal}") + df_minimal = make_minimal_db( + db_path=outfile, + df_frame_to_burst_id=df_frame_to_burst_id, + output_path=out_minimal, + ) + + # Get metadata for output dbs + metadata = _get_metadata(args) + # Create the two JSON mappings: + # from frame id -> [burst Ids] + # and burst id -> [frame Ids] + make_burst_to_frame_json( + df_minimal, output_path=out_burst_to_frame, metadata=metadata + ) + + out_frame_to_burst = outfile.replace(ext, "-frame-to-burst.json") + make_frame_to_burst_json( + db_path=outfile, output_path=out_frame_to_burst, metadata=metadata + ) + + # Add metadata to each + create_metadata_table(outfile, metadata=metadata) + create_metadata_table(out_minimal, metadata=metadata) + + print(f"Total time: {time.time() - t0:.2f} seconds") + + +if __name__ == "__main__": + main() diff --git a/src/burst_db/data/north_america_opera.geojson.zip b/src/burst_db/data/north_america_opera.geojson.zip new file mode 100644 index 0000000..be593f7 Binary files /dev/null and b/src/burst_db/data/north_america_opera.geojson.zip differ diff --git a/src/burst_db/frames.py b/src/burst_db/frames.py new file mode 100644 index 0000000..9f19727 --- /dev/null +++ b/src/burst_db/frames.py @@ -0,0 +1,251 @@ +from __future__ import annotations + +import math +from collections import Counter, namedtuple +from concurrent.futures import ProcessPoolExecutor +from functools import lru_cache +from itertools import groupby, repeat + +import numpy as np +import pandas as pd +from numpy.typing import ArrayLike +from tqdm.auto import tqdm + +MIN_FRAME = 5 +MAX_FRAME = 12 +TARGET_FRAME = 10 + +FrameSlice = namedtuple("FrameSlice", ["start_idx", "end_idx", "is_land"]) + + +def create_frame_to_burst_mapping( + burst_is_in_land: ArrayLike, + target_frame: int, + min_frame: int, + max_frame: int, + optimize_land: bool = False, +) -> pd.DataFrame: + """Create the JOIN table between frames_number and burst_id.""" + if not optimize_land: + cumulative_slice_idxs = make_simple_frame_slices(burst_is_in_land) + else: + cumulative_slice_idxs = make_land_optimized_frame_slices( + burst_is_in_land, + target_frame=target_frame, + min_frame=min_frame, + max_frame=max_frame, + ) + + # Create the frame IDs mapping to burst_id + # (frame_id, OGC_FID) + frame_ogc_fid_tuples = [] + for frame_id, (start_idx, end_idx, is_land) in enumerate( + cumulative_slice_idxs, start=1 + ): + for burst_id in range(start_idx + 1, end_idx + 1): + for ogc_fid in range(1 + 3 * (burst_id - 1), 4 + 3 * (burst_id - 1)): + frame_ogc_fid_tuples.append((frame_id, ogc_fid, is_land)) + + df_frame_to_burst_id = pd.DataFrame( + frame_ogc_fid_tuples, columns=["frame_fid", "burst_ogc_fid", "is_land"] + ) + return df_frame_to_burst_id + + +def make_simple_frame_slices( + burst_is_in_land: ArrayLike, n_bursts_per_frame: int = 9, overlap: int = 1 
+) -> list[FrameSlice]:
+    """Group together `n_bursts_per_frame` adjacent bursts, overlapping by `overlap`."""
+    num_bursts = len(burst_is_in_land)
+    num_frames = int(np.ceil(num_bursts / (n_bursts_per_frame - overlap)))
+    burst_start_idxs = [k * (n_bursts_per_frame - overlap) for k in range(num_frames)]
+
+    frame_slices: list[FrameSlice] = []
+    for start in burst_start_idxs:
+        end = start + n_bursts_per_frame
+        frame_slices.append(FrameSlice(start, end, any(burst_is_in_land[start:end])))
+
+    return frame_slices
+
+
+def make_land_optimized_frame_slices(
+    burst_is_in_land: ArrayLike, *, target_frame: int, min_frame: int, max_frame: int
+):
+    """Create FrameSlices which optimize the area on land.
+
+    Uses the boolean array `burst_is_in_land` to find slices corresponding
+    to frames with size >= `min_frame`, <= `max_frame`, and nominally `target_frame`.
+    """
+    frame_slices = create_frame_slices(burst_is_in_land, min_frame=min_frame)
+    # parallelize the solve function calls
+    with ProcessPoolExecutor() as executor:
+        cumulative_slice_idxs = list(
+            tqdm(
+                executor.map(
+                    _process_slice,
+                    frame_slices,
+                    repeat(target_frame),
+                    repeat(min_frame),
+                    repeat(max_frame),
+                ),
+                desc="Solving frame sizes",
+                total=len(frame_slices),
+            )
+        )
+    # Flatten the list of lists, since each result from executor.map is a list
+    cumulative_slice_idxs = sorted(
+        [item for sublist in cumulative_slice_idxs for item in sublist]
+    )
+    return cumulative_slice_idxs
+
+
+def _process_slice(slice_info, target_frame, min_frame, max_frame):
+    """Solve the current batch of bursts to find frame slices."""
+    start_idx, end_idx, is_land = slice_info
+    n = end_idx - start_idx
+    cur_slices = solve(
+        n,
+        target=target_frame,
+        min_frame=min_frame,
+        max_frame=max_frame,
+    )
+    # bump up so they refer to rows, instead of being from 0
+    return [(s + start_idx, e + start_idx, is_land) for (s, e) in cur_slices]
+
+
+@lru_cache(maxsize=1000)
+def solve(n, target=TARGET_FRAME, max_frame=MAX_FRAME, min_frame=MIN_FRAME):
+    """Solve the dynamic programming problem to find the best frame sizes.
+
+    Parameters
+    ----------
+    n : int
+        The number of bursts to split into frames
+    target : int
+        The target number of bursts per frame
+    max_frame : int
+        The maximum number of bursts per frame
+    min_frame : int
+        The minimum number of bursts per frame
+
+    Returns
+    -------
+    list of tuples
+        The start and end indices of each frame
+
+    Notes
+    -----
+    This is posed as the same problem as the text justification problem:
+    "words" == bursts, "lines" == frames, and where to "break the lines" ==
+    how to "group the frames".
+
+    The differences between here and reference [1]_ are
+
+    1. Our "badness" has both a maximum and minimum size, beyond which the
+       badness is infinite. We also have a "target" so that the majority of
+       frames are exactly that size, and only occasionally do we adjust the
+       size to be smaller or larger.
+    2. The slices are overlapping, so we need to add 1 to the length of each frame.
+       This is accounted for in the "badness" function.
+
+    References
+    ----------
+    .. [1] https://ocw.mit.edu/courses/6-006-introduction-to-algorithms-fall-2011/resources/mit6_006f11_lec20/
+    """
+    # DP[i][0] is the minimum badness of the frames starting at i
+    # and DP[i][1] is the index of the next frame (to backtrack and get the slices)
+    DP = [None] * (n + 1)
+    DP[n] = (0, None)
+    for i in range(n - 1, -1, -1):
+        DP[i] = min(
+            (
+                (
+                    DP[j][0]
+                    + _badness(
+                        i, j, target=target, max_frame=max_frame, min_frame=min_frame
+                    ),
+                    j,
+                )
+                for j in range(i + 1, n + 1)
+            ),
+            key=lambda x: x[0],
+        )
+
+    # backtrack and get the slices
+    slices = []
+    i = 0
+    while i < n:
+        j = DP[i][1]
+        # To make the frames overlapping, make it j + 1
+        # Just need to account for the final one, which can't be
+        # bigger than n.
+        end = min(j + 1, n)
+        slices.append((i, end))
+        i = j
+
+    return slices
+
+
+def _badness(i, j, target=TARGET_FRAME, max_frame=MAX_FRAME, min_frame=MIN_FRAME):
+    """Compute the badness of a frame of length j - i.
+
+    To account for the overlap, each frame will really be 1 bigger (except the
+    last), so if "max_frame" is 12, a frame may actually contain 13 bursts.
+    If target is a fraction (like 9.5), then either side of the target is
+    allowed with zero badness.
+    """
+    n = j - i
+    # use n + 1, since the overlap makes each frame one burst bigger
+    if (n + 1) > max_frame or (n + 1) < min_frame:
+        return math.inf
+    else:
+        return math.floor(abs((n + 1) - target)) ** 3
+
+
+def create_frame_slices(is_land_indicator, min_frame=MIN_FRAME) -> list[FrameSlice]:
+    """Group adjacent frames that are too small into continuous land tracks."""
+    indicator = is_land_indicator.copy()
+    ii = 0
+    # First iter: make sure all land sequences are at least min_frame long
+    for is_land, v in groupby(indicator):
+        n_frames = len(list(v))
+        ii += n_frames
+        if is_land and n_frames < min_frame:
+            indicator[ii - min_frame // 2 : ii + min_frame // 2 + 1] = True
+
+    # Second iter: keep looping while there are any small water sequences, making them land
+    keep_looping = True
+    while keep_looping:
+        keep_looping = False
+        ii = 0
+        for is_land, v in groupby(indicator):
+            n_frames = len(list(v))
+            ii += n_frames
+            if not is_land and n_frames < min_frame:
+                keep_looping = True
+                indicator[ii - min_frame // 2 : ii + min_frame // 2 + 1] = True
+        # loop will break when we didn't adjust any water sequences
+
+    consecutive_land_frames = Counter()
+    consecutive_water_frames = Counter()
+    frame_slices: list[FrameSlice] = []
+    ii, i_prev = 0, 0
+    for is_land, cur_indicators in groupby(indicator):
+        n_frames = len(list(cur_indicators))
+        i_prev = ii
+        ii += n_frames
+        frame_slices.append(FrameSlice(i_prev, ii, bool(is_land)))
+
+        if is_land:
+            consecutive_land_frames[n_frames] += 1
+        else:
+            consecutive_water_frames[n_frames] += 1
+
+    print("Number of occurrences with consecutive land bursts:")
+    print(sorted(consecutive_land_frames.items())[:5], end=",... ")
+    print(sorted(consecutive_land_frames.items())[-5:])
+    print("Number of occurrences with consecutive water bursts:")
+    print(sorted(consecutive_water_frames.items())[:5], end=",... ")
+    print(sorted(consecutive_water_frames.items())[-5:])
+
+    return frame_slices
diff --git a/src/burst_db/version.py b/src/burst_db/version.py
index 447dfd0..d20574e 100644
--- a/src/burst_db/version.py
+++ b/src/burst_db/version.py
@@ -6,8 +6,9 @@
 # release history
-Tag = collections.namedtuple('Tag', 'version date')
+Tag = collections.namedtuple('Tag', ['version', 'date'])
 release_history = (
+    Tag('0.1.2', '2023-07-24'),
     Tag('0.1.1', '2022-12-14'),
     Tag('0.1.0', '2022-12-08')
 )
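The frame-splitting logic added in `frames.py` can be exercised directly through `solve`. A minimal sketch (hypothetical burst count; the keyword values mirror the CLI defaults):

```python
from burst_db.frames import solve

# Split a hypothetical run of 23 consecutive land bursts into frames whose
# sizes stay between min_frame and max_frame and cluster around target.
# Each returned (start, end) slice overlaps the next one by a single burst.
slices = solve(23, target=9, min_frame=5, max_frame=10)
for start, end in slices:
    print(f"frame covers burst rows [{start}, {end}), {end - start} bursts")
```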