Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue/india data analysis #128

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions examples/india_box_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""Box plots of max cumulative energy generation per system"""
import os

import h5py
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Path of the HDF file holding the generation time series of every system,
# taken from the PV_DATA_HDF environment variable.
pv_systems_hdf = os.environ.get("PV_DATA_HDF")
if pv_systems_hdf is None:
    # fail early with a readable message instead of an opaque h5py error
    raise SystemExit("Set the PV_DATA_HDF environment variable to the path of the PV data HDF file.")

# ids of the systems to draw, one box per system
systems_with_data = [
    "56151",
    "56709",
    "58780",
    "59687",
    "59710",
    "60294",
    "60602",
    "60673",
    "66634",
    "67861",
    "71120",
    "72742",
    "73347",
    "77684",
    "77710",
    "78186",
    "79612",
    "81408",
    "82081",
    "85738",
    "86244",
    "87410",
    "90559",
    "91554",
    "97094",
    "99833",
]


# one DataFrame of daily maxima per entry of systems_with_data, in the same order
pv_systems = []

with h5py.File(pv_systems_hdf, "r") as f:
    for system_id in systems_with_data:
        df_pv_system = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))
        df_pv_system["index"] = pd.to_datetime(df_pv_system["index"], unit="ns")
        # collapse to one row per calendar day holding that day's maximum of each column
        df_pv_system = df_pv_system.groupby(pd.Grouper(key="index", freq="D")).max()
        df_pv_system["System ID"] = system_id
        df_pv_system = pd.DataFrame(df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"])
        # convert Wh to kWh
        df_pv_system["cumulative_energy_gen_kWh"] = (
            df_pv_system["cumulative_energy_gen_Wh"] / 1000
        ).astype(float)
        pv_systems.append(df_pv_system)


fig = go.Figure()
# pair each frame with its id so the trace name does not depend on the frame
# having a first row (the frame is empty when a system has no readings)
for system_id, pv_system in zip(systems_with_data, pv_systems):
    fig.add_trace(
        go.Box(
            y=pv_system["cumulative_energy_gen_kWh"],
            x=pv_system["System ID"],
            name=system_id,
            boxpoints="suspectedoutliers",
            jitter=0.5,
            whiskerwidth=0.2,
            fillcolor="rgba(93, 164, 214, 0.5)",
            marker_size=2,
            line_width=1,
        )
    )

# All layout options go through this single call; options that are only
# assigned to module-level names never reach the figure.
fig.update_layout(
    title="Daily max values of cumulative energy generation per system",
    xaxis=dict(
        # NOTE(review): the ids are numeric-looking strings; forcing a category
        # axis keeps plotly from placing them on a numeric scale - confirm the
        # rendering matches what is wanted
        type="category",
        autorange=True,
        showgrid=True,
        zeroline=True,
        gridcolor="rgb(255, 255, 255)",
        gridwidth=1,
        zerolinecolor="rgb(255, 255, 255)",
        zerolinewidth=2,
    ),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        gridcolor="rgb(255, 255, 255)",
        gridwidth=1,
        zerolinecolor="rgb(255, 255, 255)",
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor="rgb(243, 243, 243)",
    plot_bgcolor="rgb(243, 243, 243)",
    # every box is already labelled on the x axis
    showlegend=False,
)

fig.show()
62 changes: 62 additions & 0 deletions examples/india_chart_per_system.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Chart generation for a single system. This shows the cumulative energy
generation or instantaneous power generation over time. Some of the commented out code can be used to filter the data by date or plot max, mean, min, median values, etc.
"""
import os

import h5py
import numpy as np
import pandas as pd
import plotly.express as px

# Path of the HDF file holding the generation time series of every system,
# taken from the PV_DATA_HDF environment variable.
pv_systems_hdf = os.environ.get("PV_DATA_HDF")
if pv_systems_hdf is None:
    # fail early with a readable message instead of an opaque h5py error
    raise SystemExit("Set the PV_DATA_HDF environment variable to the path of the PV data HDF file.")

# This script plots one system as a line chart in plotly.
# Put the id of the system to plot here.
system_id = "SYSTEM_ID"

# read the table of the chosen system; the file is only needed for this step
with h5py.File(pv_systems_hdf, "r") as f:
    pv_system_data = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))

pv_system_data["index"] = pd.to_datetime(pv_system_data["index"], unit="ns")
pv_system_data = pv_system_data.set_index("index")
# Optional aggregation: mean weekly ("W") or monthly ("M") values, or use
# .max() with "D" for the daily maximum. The timestamps are the index at this
# point, so resample() is the call to use.
# pv_system_data = pv_system_data.resample("M").mean()

pv_system_data["System ID"] = system_id
# keep only the columns used below; the timestamps stay in the index, so
# "index" must not be listed here (it would come back as an all-NaN column)
pv_system_data = pd.DataFrame(
    pv_system_data,
    columns=[
        "cumulative_energy_gen_Wh",
        "System ID",
        "instantaneous_power_gen_W",
    ],
)
# filter the data by date
# pv_system_data = pv_system_data.loc[
# pv_system_data.index > pd.Timestamp("2019-05-01 00:00:00")
# ]
# pv_system_data = pv_system_data.loc[
# pv_system_data.index < pd.Timestamp("2019-07-01 00:00:00")
# ]

# convert Wh to kWh
pv_system_data["cumulative_energy_gen_kWh"] = (
    pv_system_data["cumulative_energy_gen_Wh"] / 1000
).astype(float)

# plot the data as a line chart
fig = px.line(
    pv_system_data,
    x=pv_system_data.index,
    # here you can swap between cumulative energy generation
    # ("cumulative_energy_gen_kWh") and instantaneous power generation
    # ("instantaneous_power_gen_W") on the y axis
    y="cumulative_energy_gen_kWh",
    title=f"Generation for System ID: {system_id}",
)
fig.update_layout(
    xaxis_title="Time",
)
fig.show()
82 changes: 82 additions & 0 deletions examples/india_gantt_chart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Gantt chart for India PV systems. This shows where there are gaps in the data."""
import os

import h5py
import numpy as np
import pandas as pd
import plotly.express as px

# Path of the HDF file holding the generation time series of every system,
# taken from the PV_DATA_HDF environment variable.
pv_data_hdf = os.environ.get("PV_DATA_HDF")
if pv_data_hdf is None:
    # fail early with a readable message instead of an opaque h5py error
    raise SystemExit("Set the PV_DATA_HDF environment variable to the path of the PV data HDF file.")

# these are the current systems with some data in the hdf file for india
systems_with_data = [
    "56151",
    "56709",
    "58780",
    "59687",
    "59710",
    "60294",
    "60602",
    "66634",
    "71120",
    "72742",
    "73347",
    "77684",
    "77710",
    "78186",
    "79612",
    "81408",
    "82081",
    "85738",
    "86244",
    "87410",
    "90559",
    "91554",
    "97094",
    "99833",
    "100451",
]

# two consecutive readings further apart than this are treated as a gap
max_reading_spacing = pd.Timedelta("1D")

# one dict per uninterrupted stretch of data: {"System ID", "Start", "Finish"}
pv_systems = []

# read the hdf file and collect the stretches of available data per system
with h5py.File(pv_data_hdf, "r") as f:
    # loop through each pv system in the hdf file
    for system_id in systems_with_data:
        df = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))
        timestamps = pd.to_datetime(df["index"], unit="ns")
        timestamps = timestamps[timestamps > pd.Timestamp("2018-01-01")].reset_index(drop=True)
        if timestamps.empty:
            # nothing recorded after the cut-off date: no bar for this system
            continue

        # A stretch starts at the very first reading and at every reading that
        # follows a gap. Including the first reading matters: otherwise the
        # data before the first gap (or all data of a system without gaps)
        # would be missing from the chart.
        starts_stretch = timestamps.diff() > max_reading_spacing
        starts_stretch.iloc[0] = True
        # A stretch ends at the reading just before the next stretch starts,
        # and the final reading always ends the last stretch.
        ends_stretch = starts_stretch.shift(-1, fill_value=True)

        # starts and ends alternate, so the two selections pair up one to one
        for start, finish in zip(timestamps[starts_stretch], timestamps[ends_stretch]):
            pv_systems.append({"System ID": system_id, "Start": start, "Finish": finish})

# plot the data as gantt chart in plotly; the blanks between bars are the gaps
fig = px.timeline(
    pv_systems,
    x_start="Start",
    x_end="Finish",
    y="System ID",
    color="System ID",
    title="Gantt Chart of PV Systems in India",
)
fig.show()
27 changes: 27 additions & 0 deletions examples/india_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
""" Example of plotting PVOutput India system locations on a map."""
import pandas as pd
import plotly.express as px

# csv file with the system metadata
pv_system_metadata_csv = "./examples/pv_data/PVOutput_India_systems.csv"
# alternatively take the path from the environment (needs `import os`):
# pv_system_metadata_csv = os.environ.get("PV_SYSTEM_FILE")
pv_system_metadata = pd.read_csv(pv_system_metadata_csv)

# keep only the columns the map uses
pv_systems_lat_lon = pd.DataFrame(
    pv_system_metadata, columns=["system_id", "latitude", "longitude", "system_size_W"]
)
# Remove every system with a missing value in any of these columns. This is
# done unconditionally: checking only the latitude column would let rows with
# a missing longitude or size through whenever every latitude is present.
pv_systems_lat_lon = pv_systems_lat_lon.dropna()


# one marker per system, sized and coloured by the system size
fig = px.scatter_geo(
    pv_systems_lat_lon,
    lat="latitude",
    lon="longitude",
    size="system_size_W",
    color="system_size_W",
    hover_name="system_id",
    scope="asia",
    title="PVOutput India System Locations",
)
fig.show()
103 changes: 103 additions & 0 deletions examples/india_mean_production.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Mean Production per System by time interval (month, week,etc.)
This example shows the mean daily production per system by month.
This makes a plot with 2 columns and half as many rows as there are systems with data.
"""
import os

import h5py
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Path of the HDF file with the generation data for each system. SYSTEM_DATA
# is the variable this script has always read; PV_DATA_HDF is accepted as a
# fallback so the same environment also works for this script.
pv_data_hdf_file = os.environ.get("SYSTEM_DATA") or os.environ.get("PV_DATA_HDF")

# these are the current systems with data in the hdf file for India
systems_with_data = [
    "56151",
    "56709",
    "58780",
    "59687",
    "59710",
    "60294",
    "60602",
    "60673",
    "66634",
    "67861",
    "71120",
    "72742",
    "73347",
    "77684",
    "77710",
    "78186",
    "79612",
    "81408",
    "82081",
    "85738",
    "86244",
    "87410",
    "90559",
    "91554",
    "97094",
    "99833",
]


# used to place each system in the two-column subplot grid
def row(row):
    """Return the 1-based grid row for the system at 1-based position ``row``.

    The grid has two columns and is filled left to right, so positions 1 and 2
    share row 1, positions 3 and 4 share row 2, and so on.

    The result is computed from the argument alone; no module-level state is
    read.
    """
    return (row + 1) // 2


# one DataFrame of monthly means per entry of systems_with_data, in the same order
pv_systems = []
# read the hdf file
with h5py.File(pv_data_hdf_file, "r") as f:
    # loop through each pv system in the hdf file. some of the lines are commented out but can
    # be used to filter the data by date or to get the mean weekly production per system
    for system_id in systems_with_data:
        df = pd.DataFrame(np.array(f["timeseries"][system_id]["table"]))
        df["index"] = pd.to_datetime(df["index"], unit="ns")
        # df = df[df["index"] > pd.Timestamp("2019-01-01")]
        # "M" groups by calendar month; use "W" for weekly means
        df_pv_system = df.groupby(pd.Grouper(key="index", freq="M")).mean()
        df_pv_system["System ID"] = system_id
        df_pv_system = pd.DataFrame(df_pv_system, columns=["cumulative_energy_gen_Wh", "System ID"])
        # convert Wh to kWh
        df_pv_system["cumulative_energy_gen_kWh"] = (
            df_pv_system["cumulative_energy_gen_Wh"] / 1000
        ).astype(float)
        pv_systems.append(df_pv_system)

# make the plot with subplots: two columns, so half as many rows as systems
# (rounded up, and at least one row because make_subplots needs one)
fig = make_subplots(
    rows=max(1, (len(pv_systems) + 1) // 2),
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.2,
    vertical_spacing=0.02,
    subplot_titles=list(systems_with_data),
)
# Loop through each system and add a line to its subplot. Positions are
# 1-based and run up to and including len(pv_systems), so the last system is
# drawn too. Titles are assigned left to right, top to bottom, which is the
# same order in which the positions map to (row, col) here.
for i, (system_id, pv_system) in enumerate(zip(systems_with_data, pv_systems), start=1):
    if pv_system.empty:
        # no readings for this system: leave its subplot blank
        continue
    fig.add_trace(
        go.Scatter(
            x=pv_system.index,
            # plot the kWh column so the values match the axis title below
            y=pv_system["cumulative_energy_gen_kWh"],
            name=system_id,
            mode="lines",
        ),
        row=row(i),
        # odd positions go in the left column, even ones in the right
        col=2 if i % 2 == 0 else 1,
    )
fig.update_yaxes(title_text="kWh")
fig.update_layout(height=3000, width=750, title_text="Mean Monthly Production per System")
fig.update_annotations(font_size=12)
fig.show()
Loading
Loading