Skip to content

Commit

Permalink
Merge pull request #35 from ouslan/34-moving
Browse files Browse the repository at this point in the history
34 moving
  • Loading branch information
ouslan authored Oct 7, 2024
2 parents 6eb7935 + bf1b708 commit b3f3d23
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 12 deletions.
110 changes: 98 additions & 12 deletions notebooks/import.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,111 @@ load_dotenv()

```{python}
import polars as pl
import plotly.graph_objects as go
import requests
from src.jp_imports.data_process import DataProcess
from src.jp_imports.data_pull import DataPull
d = DataProcess("data/")
df = d.process_price().collect()
```


```{python}
# Existing code
time = "monthly"
types = "hs"
df = d.process_int_org(time, types, False)
df = df.with_columns(pl.col("imports_qty", "exports_qty").replace(0, 1))
df = df.with_columns(hs4=pl.col("hs").str.slice(0, 4))
df = df.group_by(pl.col("hs4", "month", "year")).agg(
pl.col("imports").sum().alias("imports"),
pl.col("exports").sum().alias("exports"),
pl.col("imports_qty").sum().alias("imports_qty"),
pl.col("exports_qty").sum().alias("exports_qty")
)
df = df.with_columns(
price_imports=pl.col("imports") / pl.col("imports_qty"),
price_exports=pl.col("exports") / pl.col("exports_qty")
)
df = df.with_columns(date=pl.datetime(pl.col("year"), pl.col("month"), 1))
# Sort the DataFrame by the date column
df = df.sort("date")
# Now you can safely use group_by_dynamic
result = df.with_columns(
pl.col("price_imports").rolling_mean(window_size=3, min_periods=1).over("hs4").alias("moving_price_imports"),
pl.col("price_exports").rolling_mean(window_size=3, min_periods=1).over("hs4").alias("moving_price_exports"),
pl.col("price_imports").rolling_std(window_size=3, min_periods=1).over("hs4").alias("moving_price_imports_std"),
pl.col("price_exports").rolling_std(window_size=3, min_periods=1).over("hs4").alias("moving_price_exports_std"),
)
results = result.with_columns(
pl.col("moving_price_imports").rank("ordinal").over("date").alias("rank_imports").cast(pl.Int64),
pl.col("moving_price_exports").rank("ordinal").over("date").alias("rank_exports").cast(pl.Int64),
upper_band_imports = pl.col("moving_price_imports") + 2 * pl.col("moving_price_imports_std"),
lower_band_imports = pl.col("moving_price_imports") - 2 * pl.col("moving_price_imports_std"),
upper_band_exports = pl.col("moving_price_exports") + 2 * pl.col("moving_price_exports_std"),
lower_band_exports = pl.col("moving_price_exports") - 2 * pl.col("moving_price_exports_std"),
)
results = df.join(results, on=["date", "hs4"], how="left", validate="1:1")
# Assuming 'results' already has the necessary columns and is sorted by date and hs4
tmp = results.with_columns(
pl.col("moving_price_imports").pct_change().over("date", "hs4").alias("pct_change_imports")
).sort(by=["date", "hs4"])
# To get the percentage change for the same month of the previous year
# First, create a column for the previous year's value
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("moving_price_imports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_imports"),
)
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("moving_price_exports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_exports"),
)
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("rank_imports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_rank_imports"),
)
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("rank_exports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_rank_exports")
)
# Now calculate the percentage change
tmp = tmp.with_columns(
((pl.col("moving_price_imports") - pl.col("prev_year_imports")) / pl.col("prev_year_imports")).alias("pct_change_imports_year_over_year"),
((pl.col("moving_price_exports") - pl.col("prev_year_exports")) / pl.col("prev_year_exports")).alias("pct_change_exports_year_over_year"),
(pl.col("rank_imports") - pl.col("prev_year_rank_imports")).alias("rank_imports_change_year_over_year"),
(pl.col("rank_exports").cast(pl.Int64) - pl.col("prev_year_rank_exports").cast(pl.Int64)).alias("rank_exports_change_year_over_year")
).sort(by=["date", "hs4"])
tmp = tmp.collect()
tmp
```

```{python}
# time series API: https://api.census.gov/data/timeseries.html
key = os.getenv("CENSUS_API_KEY")
base = 'https://api.census.gov/data/timeseries/'
flow = 'intltrade/imports/statehs'
param = 'CTY_CODE,CTY_NAME,STATE,MONTH'
# date needs manual update from release schedule:
# https://www.census.gov/foreign-trade/reference/release_schedule.html
time = '2023'
url = f'{base}{flow}?get={param}&key={key}&time={time}'
r = requests.get(url).json()
data = df.filter(pl.col("hs4") == "9880")
# Create the figure with Bollinger Bands
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['date'], y=data['price_imports'], name='Value Growth'))
fig.add_trace(go.Scatter(x=data['date'], y=data['moving_price_imports'], name='Mean'))
fig.add_trace(go.Scatter(x=data['date'], y=data['upper_band_imports'], name='Upper'))
fig.add_trace(go.Scatter(x=data['date'], y=data['lower_band_imports'], name='Lower'))
fig.update_layout(title=f'Bollinger Bands for', xaxis_title='Date', yaxis_title='Value')
```

```{python}
Expand Down
80 changes: 80 additions & 0 deletions src/jp_imports/data_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,86 @@ def process_int_base(self, agr:bool=False) -> pl.LazyFrame:
else:
return df

def process_price(self) -> pl.LazyFrame:
df = self.process_int_org("monthly", "hs")
df = df.with_columns(pl.col("imports_qty", "exports_qty").replace(0, 1))
df = df.with_columns(hs4=pl.col("hs").str.slice(0, 4))

df = df.group_by(pl.col("hs4", "month", "year")).agg(
pl.col("imports").sum().alias("imports"),
pl.col("exports").sum().alias("exports"),
pl.col("imports_qty").sum().alias("imports_qty"),
pl.col("exports_qty").sum().alias("exports_qty")
)

df = df.with_columns(
price_imports=pl.col("imports") / pl.col("imports_qty"),
price_exports=pl.col("exports") / pl.col("exports_qty")
)

df = df.with_columns(date=pl.datetime(pl.col("year"), pl.col("month"), 1))

# Sort the DataFrame by the date column
df = df.sort("date")

# Now you can safely use group_by_dynamic
result = df.with_columns(
pl.col("price_imports").rolling_mean(window_size=3, min_periods=1).over("hs4").alias("moving_price_imports"),
pl.col("price_exports").rolling_mean(window_size=3, min_periods=1).over("hs4").alias("moving_price_exports"),
pl.col("price_imports").rolling_std(window_size=3, min_periods=1).over("hs4").alias("moving_price_imports_std"),
pl.col("price_exports").rolling_std(window_size=3, min_periods=1).over("hs4").alias("moving_price_exports_std"),
)
results = result.with_columns(
pl.col("moving_price_imports").rank("ordinal").over("date").alias("rank_imports").cast(pl.Int64),
pl.col("moving_price_exports").rank("ordinal").over("date").alias("rank_exports").cast(pl.Int64),
upper_band_imports = pl.col("moving_price_imports") + 2 * pl.col("moving_price_imports_std"),
lower_band_imports = pl.col("moving_price_imports") - 2 * pl.col("moving_price_imports_std"),
upper_band_exports = pl.col("moving_price_exports") + 2 * pl.col("moving_price_exports_std"),
lower_band_exports = pl.col("moving_price_exports") - 2 * pl.col("moving_price_exports_std"),
)
results = df.join(results, on=["date", "hs4"], how="left", validate="1:1")

# Assuming 'results' already has the necessary columns and is sorted by date and hs4
tmp = results.with_columns(
pl.col("moving_price_imports").pct_change().over("date", "hs4").alias("pct_change_imports")
).sort(by=["date", "hs4"])

# To get the percentage change for the same month of the previous year
# First, create a column for the previous year's value
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("moving_price_imports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_imports"),
)
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("moving_price_exports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_exports"),
)
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("rank_imports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_rank_imports"),
)
tmp = tmp.with_columns(
pl.when(pl.col("date").dt.year() > 1) # Ensure there's a previous year to compare
.then(pl.col("rank_exports").shift(12)) # Shift by 12 months
.otherwise(None)
.alias("prev_year_rank_exports")
)

# Now calculate the percentage change
tmp = tmp.with_columns(
((pl.col("moving_price_imports") - pl.col("prev_year_imports")) / pl.col("prev_year_imports")).alias("pct_change_imports_year_over_year"),
((pl.col("moving_price_exports") - pl.col("prev_year_exports")) / pl.col("prev_year_exports")).alias("pct_change_exports_year_over_year"),
(pl.col("rank_imports") - pl.col("prev_year_rank_imports")).alias("rank_imports_change_year_over_year"),
(pl.col("rank_exports").cast(pl.Int64) - pl.col("prev_year_rank_exports").cast(pl.Int64)).alias("rank_exports_change_year_over_year")
).sort(by=["date", "hs4"])
return tmp

def process_cat(self, df:pl.DataFrame, switch:list):

match switch:
Expand Down

0 comments on commit b3f3d23

Please sign in to comment.