From 480742ff0a84065b7433e944a051227cdcfc50b0 Mon Sep 17 00:00:00 2001 From: Eric E Monson Date: Thu, 22 Oct 2020 12:07:53 -0400 Subject: [PATCH] added Altair US emissions rendering timings --- Altair_NCexplore.ipynb | 310 +++++++++++++++---------- Altair_UStimings.ipynb | 505 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 691 insertions(+), 124 deletions(-) create mode 100644 Altair_UStimings.ipynb diff --git a/Altair_NCexplore.ipynb b/Altair_NCexplore.ipynb index 6b0bbb0..b2e107d 100644 --- a/Altair_NCexplore.ipynb +++ b/Altair_NCexplore.ipynb @@ -20,7 +20,8 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import altair as alt" + "import altair as alt\n", + "import time" ] }, { @@ -210,12 +211,12 @@ "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -279,7 +280,20 @@ { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "## Altair built-in time aggregation and resampling\n", + "\n", + "These are called timeunit transforms: https://altair-viz.github.io/user_guide/transform/timeunit.html\n", + "\n", + "#### Altair prefers that you explicitly specify the type of variable\n", + "\n", + "- `N` – Nominal *(unordered categorical)*\n", + "- `O` – Ordinal *(ordered categorical)*\n", + "- `Q` – Quantitative *(numbers)*\n", + "- `T` – Temporal *(time)*\n", + "\n", + "### Daily emissions patterns heatmap" + ] }, { "cell_type": "code", @@ -290,12 +304,12 @@ "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -350,9 +364,10 @@ } ], "source": [ - "alt.Chart(df).mark_bar().encode(\n", + "alt.Chart(df.reset_index()).mark_rect().encode(\n", " y = 'county_site:N',\n", - " x = 'mean(measure):Q'\n", + " x = 'hours(tstamp):T',\n", + " color = 'mean(measure)'\n", ")" ] }, @@ -360,18 +375,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Altair built-in time aggregation and resampling\n", - "\n", - "These are called timeunit transforms: https://altair-viz.github.io/user_guide/transform/timeunit.html\n", - "\n", - "#### Altair prefers that you explicitly specify the type of variable\n", - "\n", - "- `N` – Nominal *(unordered categorical)*\n", - "- `O` – Ordinal *(ordered categorical)*\n", - "- `Q` – Quantitative *(numbers)*\n", - "- `T` – Temporal *(time)*\n", + "### Line plot only requires switching three things\n", "\n", - "### Daily emissions patterns heatmap" + "- mark_rect() to mark_line()\n", + "- y to color\n", + "- color to y" ] }, { @@ -383,12 +391,12 @@ "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -443,10 +451,10 @@ } ], "source": [ - "alt.Chart(df.reset_index()).mark_rect().encode(\n", - " y = 'county_site:N',\n", + "alt.Chart(df.reset_index()).mark_line().encode(\n", + " color = 'county_site:N',\n", " x = 'hours(tstamp):T',\n", - " color = 'mean(measure)'\n", + " y = 'mean(measure)'\n", ")" ] }, @@ -454,11 +462,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Line plot only requres switching three things\n", - "\n", - "- mark_rect() to mark_line()\n", - "- y to color\n", - "- color to y" + "### Easy switch to months instead of hours" ] }, { @@ -470,12 +474,12 @@ "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -532,7 +536,7 @@ "source": [ "alt.Chart(df.reset_index()).mark_line().encode(\n", " color = 'county_site:N',\n", - " x = 'hours(tstamp):T',\n", + " x = 'month(tstamp):T',\n", " y = 'mean(measure)'\n", ")" ] @@ -541,7 +545,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Easy switch to months instead of hours" + "## Speed\n", + "\n", + "For some reason we can't really use `%timeit` to measure the whole rendering, so you need to execute the cells yourself and watch how long each chart takes to appear\n", + "\n", + "**See how much faster this is when we let Pandas do the aggregation and only feed Altair a small dataset!**" ] }, { @@ -553,12 +561,12 @@ "data": { "text/html": [ "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -613,10 +621,12 @@ } ], "source": [ - "alt.Chart(df.reset_index()).mark_line().encode(\n", - " color = 'county_site:N',\n", - " x = 'month(tstamp):T',\n", - " y = 'mean(measure)'\n", + "grp = df.groupby(['county_site',df.index.month]).agg({'measure':'mean'}).reset_index()\n", + "\n", + "alt.Chart(grp).mark_line().encode(\n", + " color='county_site:N',\n", + " x='tstamp:Q',\n", + " y='measure'\n", ")" ] }, @@ -631,9 +641,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Speed actually looking good!\n", + "### Hierarchy a bit awkward\n", "\n", - "Maybe this dataset is too small, but it used to be that with large data it was much faster to pre-transform your data in Pandas and then feed it to Altair. Here I don't see any difference, so maybe they've made big improvements lately?!" + "Altair can handle two levels of hierarchy in \"grouping\" with a combination of axis and facets. Since not all sites are in all counties, need to do the equivalent of Pandas categorical `observed=True`, which is `resolve_scale(y='independent')`" ] }, { @@ -642,79 +652,68 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "9.32 ms ± 29.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" - ] - } - ], - "source": [ - "%timeit alt.Chart(df).mark_bar().encode(y='county_site:N',x='mean(measure)')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9.37 ms ± 24.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" - ] + "data": { + "text/html": [ + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } ], - "source": [ - "grp = df.groupby('county_site').agg({'measure':'mean'}).reset_index()\n", - "\n", - "%timeit alt.Chart(grp).mark_bar().encode(y='county_site:N',x='measure:Q')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_reset = df.reset_index()\n", - "\n", - "%timeit alt.Chart(df_reset).mark_line().encode(color='county_site:N',x='month(tstamp):T',y='mean(measure)')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grp = df.groupby(['county_site',df.index.month]).agg({'measure':'mean'}).reset_index()\n", - "\n", - "%timeit alt.Chart(grp).mark_line().encode(color='county_site:N',x='tstamp:O',y='measure')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hierarchy a bit awkward\n", - "\n", - "Altair can handle two levels of hierarchy in \"grouping\" with a combination of axis and facets. Since not all sites are in all counties, need to do the equivalent of Pandas categorical `observed=True`, which is `resolve_scale(y='independent')`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "alt.Chart(df).mark_bar().encode(\n", " y = 'site:N',\n", @@ -732,9 +731,72 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "alt.Chart(df).mark_bar().encode(\n", " x = 'mean(measure):Q',\n", diff --git a/Altair_UStimings.ipynb b/Altair_UStimings.ipynb new file mode 100644 index 0000000..9a95887 --- /dev/null +++ b/Altair_UStimings.ipynb @@ -0,0 +1,505 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import altair as alt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data_set = \"42602\"\n", + "\n", + "dtypes = {\n", + " \"State Name\": \"category\",\n", + " \"County Name\": \"category\",\n", + " \"Site Num\": \"category\"\n", + "}\n", + "\n", + "df = pd.read_csv(\n", + " \"./data/AirDataEPA/hourly_\"+ data_set + \"_2018.csv\",\n", + " parse_dates = {\"tstamp\":[\"Date Local\", \"Time Local\"]},\n", + " dtype = dtypes,\n", + " usecols = list(dtypes) + [\"Sample Measurement\",\"Date Local\",\"Time Local\"]\n", + ").rename(\n", + " columns = {\n", + " \"State Name\": \"state\",\n", + " \"County Name\": \"county\",\n", + " \"Site Num\": \"site\",\n", + " \"Sample Measurement\": \"measure\"\n", + " }\n", + ").set_index('tstamp')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Adding a convenience column with county + site" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sitemeasurestatecountycounty_site
tstamp
2018-01-01 00:00:0000232.4AlabamaJeffersonJefferson 0023
2018-01-01 01:00:0000232.3AlabamaJeffersonJefferson 0023
2018-01-01 02:00:0000232.2AlabamaJeffersonJefferson 0023
2018-01-01 03:00:0000232.7AlabamaJeffersonJefferson 0023
2018-01-01 04:00:0000232.4AlabamaJeffersonJefferson 0023
\n", + "
" + ], + "text/plain": [ + " site measure state county county_site\n", + "tstamp \n", + "2018-01-01 00:00:00 0023 2.4 Alabama Jefferson Jefferson 0023\n", + "2018-01-01 01:00:00 0023 2.3 Alabama Jefferson Jefferson 0023\n", + "2018-01-01 02:00:00 0023 2.2 Alabama Jefferson Jefferson 0023\n", + "2018-01-01 03:00:00 0023 2.7 Alabama Jefferson Jefferson 0023\n", + "2018-01-01 04:00:00 0023 2.4 Alabama Jefferson Jefferson 0023" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['county_site'] = df['county'].astype('str') + \" \" + df['site'].astype('str')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.5 million rows" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3531277, 5)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### More than 5000 rows need data out of the HTML Altair generates\n", + "\n", + "See solutions to plotting large data sets: https://altair-viz.github.io/user_guide/faq.html#maxrowserror-how-can-i-plot-large-datasets\n", + "\n", + "Here I'm using the data_server solution: https://pypi.org/project/altair-data-server/\n", + "\n", + "```\n", + "pip install altair_data_server\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataTransformerRegistry.enable('data_server')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.data_transformers.enable('data_server')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(457, 2)\n" + ] + }, + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
county_sitemeasure
0Adams 00011.124180
1Adams 300115.664392
2Alameda 00078.866343
3Alameda 00099.986318
4Alameda 001112.317494
\n", + "
" + ], + "text/plain": [ + " county_site measure\n", + "0 Adams 0001 1.124180\n", + "1 Adams 3001 15.664392\n", + "2 Alameda 0007 8.866343\n", + "3 Alameda 0009 9.986318\n", + "4 Alameda 0011 12.317494" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grp = df.groupby(['county_site']).agg({'measure':'mean'}).reset_index()\n", + "print(grp.shape)\n", + "grp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "df_reset = df.reset_index()\n", + "grp_reset = df.groupby(['county_site']).agg({'measure':'mean'}).reset_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Almost immediate rendering" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(grp_reset).mark_bar().encode(x='measure:Q',y='county_site:N')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Takes about 50 seconds on my desktop machine" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt.Chart(df_reset).mark_bar().encode(x='mean(measure):Q',y='county_site:N')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}