From b25206c25e99fe01ddd3807ae3e4729237b2c62d Mon Sep 17 00:00:00 2001 From: Rachel Tipton Date: Mon, 11 Dec 2023 05:39:41 +0100 Subject: [PATCH 1/7] add system capacity histogram --- examples/india_system_capacity_histogram.py | 26 +++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 examples/india_system_capacity_histogram.py diff --git a/examples/india_system_capacity_histogram.py b/examples/india_system_capacity_histogram.py new file mode 100644 index 0000000..2196f47 --- /dev/null +++ b/examples/india_system_capacity_histogram.py @@ -0,0 +1,26 @@ +"""Histogram of system capacities for India.""" +import os +import pandas as pd +import plotly.express as px + +# load csv file with system metadata +pv_systems_metadata = os.environ.get("PV_SYSTEM_METADATA") + +# read the csv file and build a dataframe +data = pd.read_csv(pv_systems_metadata) +pv_metadata = pd.DataFrame(data, columns=["system_id", "system_size_W"]) +pv_metadata["system_id"] = pv_metadata["system_id"].astype(str) +pv_metadata["system_size_W"] = (pv_metadata["system_size_W"] / 1000).astype(float) +pv_metadata.rename( + columns={"system_id": "System ID", "system_size_W": "System Capacity (kW)"}, + inplace=True, +) + +# plot the data as histogram in plotly +fig = px.histogram( + pv_metadata, + x="System ID", + y="System Capacity (kW)", + title="PVOutput India System Capacities", +) +fig.show() From 4005a8f429585f61856ed170be6dd4fe32771d1f Mon Sep 17 00:00:00 2001 From: Rachel Tipton Date: Mon, 11 Dec 2023 05:42:03 +0100 Subject: [PATCH 2/7] add subplots with mean monthly production --- examples/india_mean_production.py | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 examples/india_mean_production.py diff --git a/examples/india_mean_production.py b/examples/india_mean_production.py new file mode 100644 index 0000000..362c69b --- /dev/null +++ b/examples/india_mean_production.py @@ -0,0 +1,106 @@ +"""Mean Production per System by time interval 
(month, week,etc.) +This example shows the mean daily production per system by month. +This makes a plot with 2 columns and half as many rows as there are systems with data. +""" +import os +import pandas as pd +from plotly.subplots import make_subplots +import numpy as np +import plotly.graph_objects as go +import h5py + +# load hdf file with the generation data for each system +pv_data_hdf_file = os.environ.get("SYSTEM_DATA") + +# these are the current systems with data in the hdf file for India +systems_with_data = [ + "56151", + "56709", + "58780", + "59687", + "59710", + "60294", + "60602", + "60673", + "66634", + "67861", + "71120", + "72742", + "73347", + "77684", + "77710", + "78186", + "79612", + "81408", + "82081", + "85738", + "86244", + "87410", + "90559", + "91554", + "97094", + "99833", +] + + +# for the subplot titles, this function is used to get the row number +def row(row): + for row in range(0, len(pv_systems)): + if i == 1: + row = 1 + elif i % 2 == 0: + row = int(i / 2) + else: + row = int((i + 1) / 2) + return row + + +pv_systems = [] +# read the hdf file +with h5py.File(pv_data_hdf_file, "r") as f: + # loop through each pv system in the hdf file. 
some of the lines are commented out but can + # be used to filter the data by date or to get the mean weekly production per system + for system_id in systems_with_data: + df = pd.DataFrame(np.array(f["timeseries"][system_id]["table"])) + df["index"] = pd.to_datetime(df["index"], unit="ns") + # df["index"] = df[df["index"] > pd.Timestamp("2019-01-01")] + df_pv_system = df.groupby(pd.Grouper(key="index", freq="M")).mean() + df_pv_system["System ID"] = system_id + df_pv_system = pd.DataFrame( + df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"] + ) + # convert Wh to kWh + df_pv_system["cumulative_energy_gen_kWh"] = ( + df_pv_system["cumulative_energy_gen_Wh"] / 1000 + ).astype(float) + pv_systems.append(df_pv_system) + i = 1 + # make the plot with subplots + fig = make_subplots( + rows=len(pv_systems), + cols=2, + shared_xaxes=False, + horizontal_spacing=0.2, + vertical_spacing=0.02, + subplot_titles=[system_id for system_id in systems_with_data], + ) + # loop through each system and add a line to the subplot + for i in range(1, len(pv_systems)): + if len(pv_systems[i - 1]) > 0: + fig.add_trace( + go.Scatter( + x=pv_systems[i - 1].index, + y=pv_systems[i - 1]["cumulative_energy_gen_Wh"], + name=pv_systems[i - 1]["System ID"][0], + mode="lines", + ), + row=row(i), + col=[2 if i % 2 == 0 else 1], + ) + i += 1 + fig.update_yaxes(title_text="kWh") + fig.update_layout( + height=3000, width=750, title_text="Mean Monthly Production per System" + ) + fig.update_annotations(font_size=12) + fig.show() From 7688761634bc19effb8a6a37aee4edca1bc3ef54 Mon Sep 17 00:00:00 2001 From: Rachel Tipton Date: Mon, 11 Dec 2023 05:42:35 +0100 Subject: [PATCH 3/7] add map with location of india sites --- examples/india_map.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 examples/india_map.py diff --git a/examples/india_map.py b/examples/india_map.py new file mode 100644 index 0000000..c2fdca9 --- /dev/null +++ b/examples/india_map.py @@ 
-0,0 +1,28 @@ +""" Example of plotting PVOutput India system locations on a map.""" +import os +import plotly.express as px +import pandas as pd + +# load csv file with system metadata +pv_system_metadata = "./examples/pv_data/PVOutput_India_systems.csv" +# pv_system_metadata = os.environ.get("PV_SYSTEM_FILE") +pv_system_metadata = pd.read_csv(pv_system_metadata) +pv_systems_lat_lon = pd.DataFrame( + pv_system_metadata, columns=["system_id", "latitude", "longitude", "system_size_W"] +) +# remove systems that don't have a lat/lon coordinate +if pv_systems_lat_lon["latitude"].isnull().values.any(): + pv_systems_lat_lon = pv_systems_lat_lon.dropna() + + +fig = px.scatter_geo( + pv_systems_lat_lon, + lat="latitude", + lon="longitude", + size="system_size_W", + color="system_size_W", + hover_name="system_id", + scope="asia", + title="PVOutput India System Locations", +) +fig.show() From 85eb717fc02c0498051adaa4f4fde8a006ee3f6b Mon Sep 17 00:00:00 2001 From: Rachel Tipton Date: Mon, 11 Dec 2023 05:43:19 +0100 Subject: [PATCH 4/7] add gantt chart for pv site data availability --- examples/india_gantt_chart.py | 81 +++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 examples/india_gantt_chart.py diff --git a/examples/india_gantt_chart.py b/examples/india_gantt_chart.py new file mode 100644 index 0000000..a26d20a --- /dev/null +++ b/examples/india_gantt_chart.py @@ -0,0 +1,81 @@ +"""Gantt chart for India PV systems. 
This shows where there are gaps in the data.""" +import os +import pandas as pd +import numpy as np +import plotly.express as px +import h5py + +# load hdf file with the generation data for each system +pv_data_hdf = os.environ.get("PV_DATA_HDF") + +# these are the current systems with some data in the hdf file for india +systems_with_data = [ + "56151", + "56709", + "58780", + "59687", + "59710", + "60294", + "60602", + "66634", + "71120", + "72742", + "73347", + "77684", + "77710", + "78186", + "79612", + "81408", + "82081", + "85738", + "86244", + "87410", + "90559", + "91554", + "97094", + "99833", + "100451", +] + + +pv_systems = [] + +# read the hdf file and get start and end dates of available data per site +with h5py.File(pv_data_hdf, "r") as f: + # loop through each pv system in the hdf file + for system_id in systems_with_data: + df = pd.DataFrame(np.array(f["timeseries"][system_id]["table"])) + df["index"] = pd.to_datetime(df["index"], unit="ns") + df = df[df["index"] > pd.Timestamp("2018-01-01")] + # set a value for the end date otherwise it registers as NaT + end_date = df["index"].iloc[-1] + df["index_difference"] = df["index"].diff() + # get startpoints of gaps in the data + df = df[df["index_difference"] > pd.Timedelta("1D")] + # get endpoints of gaps by looking at the difference between indexes + df["previous_endpoint"] = df["index"] - df["index_difference"] + df["endpoints"] = df["previous_endpoint"].shift(-1) + # set the last endpoint to the end date otherwise it registers as NaT + if len(df["endpoints"]) > 0: + df["endpoints"].iloc[-1] = end_date + + # make a dictionary for the gantt chart to plot + # loop over the start and end dates and add to start_end_data dictionary + for index, row in df.iterrows(): + start_end_data = {} + start_end_data["System ID"] = system_id + start_end_data["Start"] = row["index"] + start_end_data["Finish"] = row["endpoints"] + + pv_systems.append(start_end_data) + + # plot the data as gantt chart in plotly + fig = 
px.timeline( + pv_systems, + x_start="Start", + x_end="Finish", + y="System ID", + color="System ID", + title="Gantt Chart of PV Systems in India", + ) + fig.show() From 5e00d7b8fb795bf1021d13b85ce8960790efef6b Mon Sep 17 00:00:00 2001 From: Rachel Tipton Date: Mon, 11 Dec 2023 05:45:29 +0100 Subject: [PATCH 5/7] add chart for individual pv system --- examples/india_chart_per_system.py | 60 ++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/india_chart_per_system.py diff --git a/examples/india_chart_per_system.py b/examples/india_chart_per_system.py new file mode 100644 index 0000000..2fcfa59 --- /dev/null +++ b/examples/india_chart_per_system.py @@ -0,0 +1,60 @@ +"""Chart generation for a single system. This shows the cumulative energy +generation or instantaneous power generation over time. Some of the commented out code can be used to filter the data by date or plot max, mean, min, median values, etc.""" +import os +import pandas as pd +import numpy as np +import plotly.express as px +import h5py + +# load hdf file with the generation data for each system +pv_systems_hdf = os.environ.get("PV_DATA_HDF") + +# this plots one system as a line chart in plotly +# plot the data as line graph in plotly + +# can choose a system id to input here +system_id = "SYSTEM_ID" +# read the hdf file and get the data for the system id +with h5py.File(pv_systems_hdf, "r") as f: + pv_system_data = pd.DataFrame(np.array(f["timeseries"][system_id]["table"])) + pv_system_data["index"] = pd.to_datetime(pv_system_data["index"], unit="ns") + pv_system_data = pv_system_data.set_index("index", inplace=False) + # this code can be used to get the mean weekly ("W") or monthly ("M") + # production per system or the max daily production ("D") + # pv_system_data = pv_system_data.groupby(pd.Grouper(key="index", freq="M").mean + + pv_system_data["System ID"] = system_id + pv_system_data = pd.DataFrame( + pv_system_data, + columns=[ + "index", + 
"cumulative_energy_gen_Wh", + "System ID", + "instantaneous_power_gen_W", + ], + ) + # filter the data by date + # pv_system_data = pv_system_data.loc[ + # pv_system_data.index > pd.Timestamp("2019-05-01 00:00:00") + # ] + # pv_system_data = pv_system_data.loc[ + # pv_system_data.index < pd.Timestamp("2019-07-01 00:00:00") + # ] + + pv_system_data["cumulative_energy_gen_kWh"] = ( + pv_system_data["cumulative_energy_gen_Wh"] / 1000 + ).astype(float) + + # plot the data as a line chart + fig = px.line( + pv_system_data, + x=pv_system_data.index, + # here you can swap between cumulative energy generation and instantaneous + # power generation on the y axis + y=pv_system_data["cumulative_energy_gen_kWh"], + title=f"Generation for System ID: {system_id}", + ) + fig.update_layout( + xaxis_title="Time", + ) + fig.show() From bacdb5aa7241f8843d0ef0edb5b4d65a2c97f526 Mon Sep 17 00:00:00 2001 From: Rachel Tipton Date: Mon, 11 Dec 2023 05:46:08 +0100 Subject: [PATCH 6/7] add box-plot for max daily value --- examples/india_box_plots.py | 122 ++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 examples/india_box_plots.py diff --git a/examples/india_box_plots.py b/examples/india_box_plots.py new file mode 100644 index 0000000..e393b1e --- /dev/null +++ b/examples/india_box_plots.py @@ -0,0 +1,122 @@ +"""Box plots of max cumulative energy generation per system""" +import pandas as pd +import numpy as np +import plotly.graph_objects as go +import h5py + +# load hdf file with the generation data for each system +pv_systems_hdf = os.environ.get("PV_DATA_HDF") + +systems_with_data = [ + "56151", + "56709", + "58780", + "59687", + "59710", + "60294", + "60602", + "60673", + "66634", + "67861", + "71120", + "72742", + "73347", + "77684", + "77710", + "78186", + "79612", + "81408", + "82081", + "85738", + "86244", + "87410", + "90559", + "91554", + "97094", + "99833", +] + + +pv_systems = [] + +with h5py.File(pv_systems_hdf, "r") as f: + for 
system_id in systems_with_data: + df_pv_system = pd.DataFrame(np.array(f["timeseries"][system_id]["table"])) + df_pv_system["index"] = pd.to_datetime(df_pv_system["index"], unit="ns") + df_pv_system = df_pv_system.groupby(pd.Grouper(key="index", freq="D")).max() + df_pv_system["System ID"] = system_id + df_pv_system = pd.DataFrame( + df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"] + ) + df_pv_system["cumulative_energy_gen_kWh"] = ( + df_pv_system["cumulative_energy_gen_Wh"] / 1000 + ).astype(float) + pv_systems.append(df_pv_system) + + +fig = go.Figure() +for pv_system in pv_systems: + fig.add_trace( + go.Box( + y=pv_system["cumulative_energy_gen_kWh"], + x=pv_system["System ID"], + name=pv_system["System ID"][0], + boxpoints="suspectedoutliers", + jitter=0.5, + whiskerwidth=0.2, + fillcolor="rgba(93, 164, 214, 0.5)", + marker_size=2, + line_width=1, + ) + ) + fig.update_layout( + title="Daily max values of cumulative energy generation per system", + yaxis=dict( + autorange=True, + showgrid=True, + zeroline=True, + gridcolor="rgb(255, 255, 255)", + gridwidth=1, + zerolinecolor="rgb(255, 255, 255)", + zerolinewidth=2, + ), + ), + + y_data = df_pv_system["cumulative_energy_gen_kWh"] + x_data = df_pv_system["System ID"] + + colors = [ + "rgba(93, 164, 214, 0.5)", + "rgba(255, 144, 14, 0.5)", + "rgba(44, 160, 101, 0.5)", + "rgba(255, 65, 54, 0.5)", + "rgba(207, 114, 255, 0.5)", + "rgba(127, 96, 0, 0.5)", + ] + + margin = ( + dict( + l=40, + r=30, + b=80, + t=100, + ), + ) + paper_bgcolor = ("rgb(243, 243, 243)",) + plot_bgcolor = ("rgb(243, 243, 243)",) + showlegend = (False,) + title = ("Daily Max Production per System for India",) + xaxis = ( + dict( + autorange=True, + showgrid=True, + zeroline=True, + dtick=5, + gridcolor="rgb(255, 255, 255)", + gridwidth=1, + zerolinecolor="rgb(255, 255, 255)", + zerolinewidth=2, + ), + ) + +fig.show() From 35625dd7cb6192529f072dacc26c361eca73c9a3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 04:58:32 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/india_box_plots.py | 8 +++----- examples/india_chart_per_system.py | 8 +++++--- examples/india_gantt_chart.py | 5 +++-- examples/india_map.py | 3 +-- examples/india_mean_production.py | 15 ++++++--------- examples/india_system_capacity_histogram.py | 1 + pvoutput/mapscraper.py | 1 + pvoutput/pvoutput.py | 2 +- scripts/fetch_pv_timeseries.py | 11 ++++++----- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/examples/india_box_plots.py b/examples/india_box_plots.py index e393b1e..0a0a057 100644 --- a/examples/india_box_plots.py +++ b/examples/india_box_plots.py @@ -1,8 +1,8 @@ """Box plots of max cumulative energy generation per system""" -import pandas as pd +import h5py import numpy as np +import pandas as pd import plotly.graph_objects as go -import h5py # load hdf file with the generation data for each system pv_systems_hdf = os.environ.get("PV_DATA_HDF") @@ -45,9 +45,7 @@ df_pv_system["index"] = pd.to_datetime(df_pv_system["index"], unit="ns") df_pv_system = df_pv_system.groupby(pd.Grouper(key="index", freq="D")).max() df_pv_system["System ID"] = system_id - df_pv_system = pd.DataFrame( - df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"] - ) + df_pv_system = pd.DataFrame(df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"]) df_pv_system["cumulative_energy_gen_kWh"] = ( df_pv_system["cumulative_energy_gen_Wh"] / 1000 ).astype(float) diff --git a/examples/india_chart_per_system.py b/examples/india_chart_per_system.py index 2fcfa59..7dc8667 100644 --- a/examples/india_chart_per_system.py +++ b/examples/india_chart_per_system.py @@ -1,10 +1,12 @@ """Chart generation for a single system. This shows the cumulative energy -generation or instantaneous power generation over time. 
Some of the commented out code can be used to filter the data by date or plot max, mean, min, median values, etc.""" +generation or instantaneous power generation over time. Some of the commented out code can be used to filter the data by date or plot max, mean, min, median values, etc. +""" import os -import pandas as pd + +import h5py import numpy as np +import pandas as pd import plotly.express as px -import h5py # load hdf file with the generation data for each system pv_systems_hdf = os.environ.get("PV_DATA_HDF") diff --git a/examples/india_gantt_chart.py b/examples/india_gantt_chart.py index a26d20a..c1654b7 100644 --- a/examples/india_gantt_chart.py +++ b/examples/india_gantt_chart.py @@ -1,9 +1,10 @@ """Gantt chart for India PV systems. This shows where there are gaps in the data.""" import os -import pandas as pd + +import h5py import numpy as np +import pandas as pd import plotly.express as px -import h5py # load hdf file with the generation data for each system pv_data_hdf = os.environ.get("PV_DATA_HDF") diff --git a/examples/india_map.py b/examples/india_map.py index c2fdca9..c849402 100644 --- a/examples/india_map.py +++ b/examples/india_map.py @@ -1,7 +1,6 @@ """ Example of plotting PVOutput India system locations on a map.""" -import os -import plotly.express as px import pandas as pd +import plotly.express as px # load csv file with system metadata pv_system_metadata = "./examples/pv_data/PVOutput_India_systems.csv" diff --git a/examples/india_mean_production.py b/examples/india_mean_production.py index 362c69b..0b461ca 100644 --- a/examples/india_mean_production.py +++ b/examples/india_mean_production.py @@ -3,11 +3,12 @@ This makes a plot with 2 columns and half as many rows as there are systems with data. 
""" import os -import pandas as pd -from plotly.subplots import make_subplots + +import h5py import numpy as np +import pandas as pd import plotly.graph_objects as go -import h5py +from plotly.subplots import make_subplots # load hdf file with the generation data for each system pv_data_hdf_file = os.environ.get("SYSTEM_DATA") @@ -66,9 +67,7 @@ def row(row): # df["index"] = df[df["index"] > pd.Timestamp("2019-01-01")] df_pv_system = df.groupby(pd.Grouper(key="index", freq="M")).mean() df_pv_system["System ID"] = system_id - df_pv_system = pd.DataFrame( - df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"] - ) + df_pv_system = pd.DataFrame(df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"]) # convert Wh to kWh df_pv_system["cumulative_energy_gen_kWh"] = ( df_pv_system["cumulative_energy_gen_Wh"] / 1000 @@ -99,8 +98,6 @@ def row(row): ) i += 1 fig.update_yaxes(title_text="kWh") - fig.update_layout( - height=3000, width=750, title_text="Mean Monthly Production per System" - ) + fig.update_layout(height=3000, width=750, title_text="Mean Monthly Production per System") fig.update_annotations(font_size=12) fig.show() diff --git a/examples/india_system_capacity_histogram.py b/examples/india_system_capacity_histogram.py index 2196f47..fda4c2e 100644 --- a/examples/india_system_capacity_histogram.py +++ b/examples/india_system_capacity_histogram.py @@ -1,5 +1,6 @@ """Histogram of system capacities for India.""" import os + import pandas as pd import plotly.express as px diff --git a/pvoutput/mapscraper.py b/pvoutput/mapscraper.py index c64f1f1..9c0c8d1 100644 --- a/pvoutput/mapscraper.py +++ b/pvoutput/mapscraper.py @@ -350,6 +350,7 @@ def clean_soup(soup): """Function to clean scraped soup object. Note that the downloaded soup could change over time. 
+ Args: soup: bs4.BeautifulSoup diff --git a/pvoutput/pvoutput.py b/pvoutput/pvoutput.py index 0e4ea35..6993985 100644 --- a/pvoutput/pvoutput.py +++ b/pvoutput/pvoutput.py @@ -924,7 +924,7 @@ def _download_multiple_worker( ) else: total_rows += len(timeseries) - _LOG.info(f'Adding timezone {timezone} to {total_rows} rows') + _LOG.info(f"Adding timezone {timezone} to {total_rows} rows") timeseries = timeseries.tz_localize(timezone) _LOG.info( "system_id: %d: %d rows retrieved: %s to %s", diff --git a/scripts/fetch_pv_timeseries.py b/scripts/fetch_pv_timeseries.py index ac7d4a6..39b6eda 100644 --- a/scripts/fetch_pv_timeseries.py +++ b/scripts/fetch_pv_timeseries.py @@ -21,14 +21,15 @@ or create and use a ~/.pvoutput.yml file as described in the PVOutput library documentation """ -from pvoutput import * - -import click as cl import datetime as dt +import logging +import pathlib import sys + +import click as cl import pandas as pd -import pathlib -import logging + +from pvoutput import * @cl.command()