diff --git a/Presentation/Figures/Images/Styling-Line-Graphs/styling_line_graphs_Python.png b/Presentation/Figures/Images/Styling-Line-Graphs/styling_line_graphs_Python.png new file mode 100644 index 00000000..a1fd5be2 Binary files /dev/null and b/Presentation/Figures/Images/Styling-Line-Graphs/styling_line_graphs_Python.png differ diff --git a/Presentation/Figures/styling_line_graphs.html b/Presentation/Figures/styling_line_graphs.html index 1e4385d1..f5e4197a 100644 --- a/Presentation/Figures/styling_line_graphs.html +++ b/Presentation/Figures/styling_line_graphs.html @@ -223,6 +223,115 @@

Implementation

+

+ Python +

+
import pandas as pd
+import seaborn.objects as so
+import numpy as np
+import matplotlib.pyplot as plt
+from seaborn import axes_style
+
+
+
+
+# Download the economics dataset (from ggplot2, so the comparison with the R example is apples-to-apples)
+url = "https://raw.githubusercontent.com/tidyverse/ggplot2/main/data-raw/economics.csv"
+economics = pd.read_csv(url)
+
+
+# Quick manipulation of the dataframe to convert the date column to datetime
+df = (
+    economics
+    .assign(
+        date = lambda df: pd.to_datetime(df['date'])
+        )
+)
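+# (Aside, a hedged alternative: pd.read_csv(url, parse_dates=['date']) would parse
+# the date column at read time and make the .assign() step above unnecessary.)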
+
+
+
+# Default plot (notice the x-axis only shows two year ticks! We'll fix this in p2)
+p1 = (
+    so.Plot(data=df, x='date', y='uempmed')
+    .add(so.Line())
+    )
+p1
+
+
+
+
+## Change line color and chart labels, and fix the x-axis
+## Note that color is set inside the so.Line() call, so it simply colors the line.
+## If color were instead passed inside the so.Plot() object, seaborn.objects would map it
+## to a variable and draw a differently colored line for each value of that factor
+## variable (column). (This mapping is commonly referred to as hue in seaborn.)
+# In our case, we just want a single color, so we pass it directly to so.Line().
+p2 = (
+    so.Plot(data=df, x='date', y='uempmed')
+    .add(so.Line(color='purple'))
+    .label(title='Median Duration of Unemployment', x='Date', y='')
+    .scale(x=so.Temporal().tick(upto=10)) # Needed so the x-axis prints more than two ticks under the current seaborn.objects defaults
+    .theme(axes_style("whitegrid")) # axes_style() from the parent seaborn library returns a prebuilt theme based on the style name you pass
+    )
+
+p2
+
+
+
+## Plotting multiple lines on one chart (with different line types, colors, and sizes)
+p3 = (
+    so.Plot(data=df)
+    .add(so.Line(color='darkblue', linewidth=5), x='date', y='uempmed')
+    .add(so.Line(color='red', linewidth=2, linestyle='dotted'), x='date', y='psavert')
+    .label(title='Unemployment Duration (Blue)\n & Savings Rate (Red)', 
+           x='Date', 
+           y='')
+    .scale(x=so.Temporal().tick(upto=10)) # Needed so the x-axis prints more than two ticks (see p2)
+    .theme(axes_style("whitegrid")) # Prebuilt seaborn theme via axes_style() (see p2)
+    )
+
+p3
+
+
+## Plotting a different colored line for each group
+## There isn't a natural factor in this data so let's just duplicate the data and make one up
+df['fac'] = 1
+df2 = df.copy()
+df2['fac'] = 2
+df2['uempmed'] = df2['uempmed'] - 2 + np.random.normal(size=len(df2))
+df_final = pd.concat([df, df2], ignore_index=True).astype({'fac':'category'})
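+# (Aside: calling np.random.seed() before the np.random.normal() draw above would
+# make this made-up factor data reproducible across runs, if that matters to you.)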
+
+
+p4 = (
+    so.Plot(data=df_final, x='date', y='uempmed', color='fac')
+    .add(so.Line())
+    .label(title = "Median Duration of Unemployment",
+           x = "Date",
+           y = "",
+           color='Random Factor')
+    .scale(x=so.Temporal().tick(upto=10)) # Needed so the x-axis prints more than two ticks (see p2)
+    .theme(axes_style("whitegrid")) # Prebuilt seaborn theme via axes_style() (see p2)
+)
+
+p4
+
+
+
+# Combine all four plots in a 2x2 grid of subplots
+fig, axs = plt.subplots(2, 2, figsize=(10, 8))
+# Draw each plot in the corresponding subplot
+p1.on(axs[0, 0]).plot()
+p2.on(axs[0, 1]).plot()
+p3.on(axs[1, 0]).plot()
+p4.on(axs[1, 1]).plot()
+
+# Adjust layout to avoid overlap
+plt.tight_layout()
+
+# Show the combined plot
+plt.show()
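+
+# (Optional) Save the combined figure to disk -- a minimal sketch; the filename here
+# is just an example and not required by anything above.
+fig.savefig('styling_line_graphs_Python.png', dpi=150, bbox_inches='tight')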
+
+

The four plots generated by the code are (in order p1, p2, then p3 and p4):

+

Four Styled Line Graphs in Python

R

diff --git a/assets/js/search-data.json b/assets/js/search-data.json index a2f39cc2..f126f7b6 100644 --- a/assets/js/search-data.json +++ b/assets/js/search-data.json @@ -4253,246 +4253,252 @@ "url": "/Presentation/Figures/styling_line_graphs.html#implementation", "relUrl": "/Presentation/Figures/styling_line_graphs.html#implementation" },"709": { + "doc": "Styling Line Graphs", + "title": "Python", + "content": "import pandas as pd import seaborn.objects as so import numpy as np import matplotlib.pyplot as plt from seaborn import axes_style # Download the economics dataset (from ggplot2 so comparison is apples-to-apples) url = \"https://raw.githubusercontent.com/tidyverse/ggplot2/main/data-raw/economics.csv\" economics = pd.read_csv(url) # Quick manipulation of dataframe to convert column to datetime df = ( economics .assign( date = lambda df: pd.to_datetime(df['date']) ) ) # Default plots (Notice the xaxis only has 2 years! We'll fix this in p2) p1 = ( so.Plot(data=df, x='date', y='uempmed') .add(so.Line()) ) p1 ## Change line color and chart labels, and fix xaxis ## Note here that color is inside of the Line call, so this would color the line. ## If color were instead *inside* the so.Plot() object, SO would assign it ## a different line for each value of the factor variable (column), colored differently. (Commonly referred to as hue in seaborn) # However, in our case, we can pass a color directly. p2 = ( so.Plot(data=df, x='date', y='uempmed') .add(so.Line(color='purple')) .label(title='Median Duration of Unemploymeny', x='Date', y='') .scale(x=so.Temporal().tick(upto=10)) #Needed for current configuration of seaborn.objects so xaxis prints more than 2 ticks .theme(axes_style(\"whitegrid\")) #use a function from parent seaborn library, that will pass a prebuilt selection based on what you pass ) p2 ## plotting multiple charts (of different line types and sizes) p3 = ( so.Plot(data=df) .add(so.Line(color='darkblue', linewidth=5), x='date', y='uempmed') .add(so.Line(color='red', linewidth=2, linestyle='dotted'), x='date', y='psavert') .label(title='Unemployment Duration (Blue)\\n & Savings Rate (Red)', x='Date', y='') .scale(x=so.Temporal().tick(upto=10)) #Needed for current configuration of seaborn.objects so xaxis prints more than 2 ticks .theme(axes_style(\"whitegrid\")) #use a function from parent seaborn library, that will pass a prebuilt selection based on what you pass ) p3 ## Plotting a different line type for each group ## There isn't a natural factor in this data so let's just duplicate the data and make one up df['fac'] = 1 df2 = df.copy() df2['fac'] = 2 df2['uempmed'] = df2['uempmed'] - 2 + np.random.normal(size=len(df2)) df_final = pd.concat([df, df2], ignore_index=True).astype({'fac':'category'}) p4 = ( so.Plot(data=df_final, x='date', y='uempmed', color='fac') .add(so.Line()) .label(title = \"Median Duration of Unemployment\", x = \"Date\", y = \"\", color='Random Factor') .scale(x=so.Temporal().tick(upto=10)) #Needed for current configuration of seaborn.objects so xaxis prints more than 2 ticks .theme(axes_style(\"whitegrid\")) #use a function from parent seaborn library, that will pass a prebuilt selection based on what you pass ) p4 # Plot all 4 plots fig, axs = plt.subplots(2, 2, figsize=(10, 8)) # Draw each plot in the corresponding subplot p1.on(axs[0, 0]).plot() p2.on(axs[0, 1]).plot() p3.on(axs[1, 0]).plot() p4.on(axs[1, 1]).plot() # Adjust layout to avoid overlap plt.tight_layout() # Show the combined plot plt.show() . 
The four plots generated by the code are (in order p1, p2, then p3 and p4): . ", + "url": "/Presentation/Figures/styling_line_graphs.html#python", + "relUrl": "/Presentation/Figures/styling_line_graphs.html#python" + },"710": { "doc": "Styling Line Graphs", "title": "R", "content": "## If necessary ## install.packages(c('ggplot2','cowplot')) ## load packages library(ggplot2) ## Cowplot is just to join together the four graphs at the end library(cowplot) ## load data (the Economics dataset comes with ggplot2) eco_df <- economics ## basic plot p1 <- ggplot() + geom_line(aes(x=date, y = uempmed), data = eco_df) p1 ## Change line color and chart labels ## Note here that color is *outside* of the aes() argument, and so this will color the line ## If color were instead *inside* aes() and set to a factor variable, ggplot would create ## a different line for each value of the factor variable, colored differently. p2 <- ggplot() + ## choose a color of preference geom_line(aes(x=date, y = uempmed), color = \"navyblue\", data = eco_df) + ## add chart title and change axes labels labs( title = \"Median Duration of Unemployment\", x = \"Date\", y = \"\") + ## Add a ggplot theme theme_light() ## center the chart title theme(plot.title = element_text(hjust = 0.5)) + p2 ## plotting multiple charts (of different line types and sizes) p3 <-ggplot() + geom_line(aes(x=date, y = uempmed), color = \"navyblue\", size = 1.5, data = eco_df) + geom_line(aes(x=date, y = psavert), color = \"red2\", linetype = \"dotted\", size = 0.8, data = eco_df) + labs( title = \"Unemployment Duration (Blue) and Savings Rate (Red)\", x = \"Date\", y = \"\") + theme_light() + theme(plot.title = element_text(hjust = 0.5)) p3 ## Plotting a different line type for each group ## There isn't a natural factor in this data so let's just duplicate the data and make one up eco_df$fac <- factor(1, levels = c(1,2)) eco_df2 <- eco_df eco_df2$fac <- 2 eco_df2$uempmed <- eco_df2$uempmed - 2 + rnorm(nrow(eco_df2)) eco_df <- rbind(eco_df, eco_df2) p4 <- ggplot() + ## This time, color goes inside aes geom_line(aes(x=date, y = uempmed, color = fac), data = eco_df) + ## add chart title and change axes labels labs( title = \"Median Duration of Unemployment\", x = \"Date\", y = \"\") + ## Add a ggplot theme theme_light() + ## center the chart title theme(plot.title = element_text(hjust = 0.5), ## Move the legend onto some blank space on the diagram legend.position = c(.25,.8), ## And put a box around it legend.background = element_rect(color=\"black\")) + ## Retitle the legend that pops up to explain the discrete (factor) difference in colors ## (note if we just want a name change we could do guides(color = guide_legend(title = 'Random Factor')) instead) scale_color_manual(name = \"Random Factor\", # And specify the colors for the factor levels (1 and 2) by hand if we like values = c(\"1\" = \"red\", \"2\" = \"blue\")) p4 # Put them all together with cowplot for LOST upload plot_grid(p1,p2,p3,p4, nrow=2) . The four plots generated by the code are (in order p1, p2, then p3 and p4): . ", "url": "/Presentation/Figures/styling_line_graphs.html#r", "relUrl": "/Presentation/Figures/styling_line_graphs.html#r" - },"710": { + },"711": { "doc": "Styling Line Graphs", "title": "Stata", "content": "In Stata, one can create plot lines using the command line, which in combination with twoway allows you to modify components of sub-plots individually. In this demonstration, I will use minimal formatting, but will apply minimal modifications using Ben Jann’s grstyle. 
** Setup: Install grstyle ssc install grstyle grstyle init grstyle color background white grstyle set legend, nobox . Setup . First, you need to load the data into Stata. The data is a copy from the data economics available within ggplot package, and translated using foreign. use https://friosavila.github.io/playingwithstata/rnd_dta/economics, clear ** Since this was taken directly from R, the date variable will not be formatted. ** We can format the date using the following. format date %tdCCYY ** This indicates to create a _mask_, to put on top of \"data\" ** but only display the \"year\" . Simple line plot . Now, For a simple plot, we could use the following syntax: . line yvar1 [yvar2 yvar3 ...] xvar1 . This requests plotting all variables yvarX against xvar1 (horizontal axis). Internally, the command connects every pair of data [yvar1,xvar1] sequentially, based on the order they appear in the dataset. Below, we can do that, plotting unemployment duration uempmed vs date. line uempmed date . Something to keep in mind. If the dataset is not sorted by date, you may end up with a lineplot that is all over the place. For example: . sort uempmed line uempmed date . To avoid this, it is recommended to use the option sort. line uempmed date, sort . Adding titles, and axis titles . The next thing you may want to do is add information to the plot, so its easier to understand what the figure is showing. Specifically, we can add information on the vertical axis using ytitle(). I will also use xtitle() to drop the horizontal axis information, and add a title title(). line uempmed date, sort /// ytitle(\"# of weeks\") xtitle(\"\") /// title(Unemployment Duration) . Changing Line characteristics. It is also possible to modify the line width lwidth(), line color lcolor(), and line pattern lpattern(). To show how this can affect the plot, below 4 examples are provided. Notice that each plot is saved in memory using name(), and all are combined using graph combine. line uempmed date, sort /// ytitle(\"# of weeks\") xtitle(\"\") /// title(Unemployment Duration 1) /// lwidth(.5) lcolor(red) lpattern(solid) name(m1,replace) line uempmed date, sort /// ytitle(\"# of weeks\") xtitle(\"\") /// title(Unemployment Duration 2) /// lwidth(.25) lcolor(gold) lpattern(dash) name(m2,replace) line uempmed date, sort /// ytitle(\"# of weeks\") xtitle(\"\") /// title(Unemployment Duration 3) /// lwidth(1) lcolor(\"68 170 153\") lpattern(dot) name(m3,replace) line uempmed date, sort /// ytitle(\"# of weeks\") xtitle(\"\") /// title(Unemployment Duration 4) /// lwidth(.5) lcolor(navy%50) lpattern(dash_dot) name(m4,replace) graph combine m1 m2 m3 m4 . Ploting Multiple Lines, and different axis . You may also want to plot multiple variables in the same figure. There are two ways to do this: . twoway (line uempmed date, sort lwidth(.75) lpattern(solid) ) /// (line psavert date, sort lwidth(.25) lpattern(dash) ), /// legend (order(1 \"Unemployment duration\" 2 \"Saving rate\")) line uempmed psavert date, sort lwidth(0.75 .25) lpattern(solid dash) /// legend(order(1 \"Unemployment duration\" 2 \"Saving rate\")) . Both options provide the same figure, however, I prefer the first option since that allows for more flexibility. You can also choose to plot each variable in a different axis. Each axis can have its own title. 
twoway (line uempmed date, sort lwidth(.75) lpattern(solid) yaxis(1)) /// (line psavert date, sort lwidth(.25) lpattern(dash) yaxis(2)), /// legend(order(1 \"Unemployment duration\" 2 \"Saving rate\")) /// ytitle(Weeks ,axis(1) ) ytitle(Interest rate,axis(2) ) . Adding informative Vertical lines. Finally, it is possible to add vertical lines. This may be useful, for example, to differentiate the great recession period. Additionally, in this plot, I add a note. twoway (line uempmed date, sort lwidth(.75) lpattern(solid) yaxis(1)) /// (line psavert date, sort lwidth(.25) lpattern(dash) yaxis(2)), /// legend(order(1 \"Unemployment duration\" 2 \"Saving rate\")) /// ytitle(Weeks ,axis(1) ) ytitle(Interest rate,axis(2) ) /// xline(`=td(1dec2007)'/`=td(30jun2008)', lcolor(gs8)) /// note(\"Note:Grey Area marks the Great recession Period\") /// title(\"Unemployement Duration and\" \"Saving Rate\") . ", "url": "/Presentation/Figures/styling_line_graphs.html#stata", "relUrl": "/Presentation/Figures/styling_line_graphs.html#stata" - },"711": { + },"712": { "doc": "Graphing a By-Group or Over-Time Summary Statistic", "title": "Graphing a By-Group or Over-Time Summary Statistic", "content": "A common task in exploring or presenting data is looking at by-group summary statistics. This commonly takes the form of a graph where the group is along the x-axis and the summary statistic is on the y-axis. Often this group might be a time period so as to look at changes over time. Producing such a graph requires three things: . | A decision of what kind of graph will be produced (line graph, bar graph, scatterplot, etc.) | The creation of the grouped summary statistic | The creation of the graph itself | . ", "url": "/Presentation/Figures/summary_graphs.html", "relUrl": "/Presentation/Figures/summary_graphs.html" - },"712": { + },"713": { "doc": "Graphing a By-Group or Over-Time Summary Statistic", "title": "Keep in Mind", "content": ". | Line graphs are only intended for use in cases where the x-axis variable (1) is ordinal (one value is “more” than another), and (2) takes consistently-sized jumps from one observation to the next. A time-based x-axis is a good candidate for use with line graphs. If your group is categorical and doesn’t follow a natural ordering, then do not use a line graph. Consider a bar graph or some other kind of graph instead. | If you are making a graph for presentation rather than exploration, and your x-axis variable is categorical and doesn’t have a natural ordering, your graph will often be easier to read if the x-axis is sorted by the height of the y-axis. The way to do this will be demonstrated in the code examples below. | . ", "url": "/Presentation/Figures/summary_graphs.html#keep-in-mind", "relUrl": "/Presentation/Figures/summary_graphs.html#keep-in-mind" - },"713": { + },"714": { "doc": "Graphing a By-Group or Over-Time Summary Statistic", "title": "Also Consider", "content": ". | This page will cover how to calculate the summary statistic in the graph code itself. However, an alternate approach that provides a bit more control and flexibility is to calculate the by-group summary statistic by collapsing the data set so there is only one observation per group in the data. Then, just make a regular graph of whatever kind you like, with the group along the x-axis, and the summary statistic on the y-axis. See Line Graphs or Bar Graphs. 
| If you want a version of these graphs that has two groupings - one group along the x-axis and with different bars or lines for another group, see how to graph multiple lines on Line Graphs or multiple bars per x-axis point on Bar Graphs. | . ", "url": "/Presentation/Figures/summary_graphs.html#also-consider", "relUrl": "/Presentation/Figures/summary_graphs.html#also-consider" - },"714": { + },"715": { "doc": "Graphing a By-Group or Over-Time Summary Statistic", "title": "Implementations", "content": " ", "url": "/Presentation/Figures/summary_graphs.html#implementations", "relUrl": "/Presentation/Figures/summary_graphs.html#implementations" - },"715": { + },"716": { "doc": "Graphing a By-Group or Over-Time Summary Statistic", "title": "R", "content": "# We want ggplot2 for graphing and dplyr for the storms data library(tidyverse) data(storms) # First, a line graph with time on the x-axis # This uses stat_summary # Note that stat_summary_bin is also available, # which first bins the x-axis, if desired # Put the time variable in the x aesthetic, and the # variable to be summarized in y ggplot(storms, aes(x = year, y = wind)) + stat_summary(geom = 'line', # Do we want a line graph? Point? fun = mean) + # What function should be used to summarize? # Note another good option for geom is 'pointrange', the default # which you can get from just stat_summary(), # which also shows the range of data # Just decoration: labs(x = 'Year', y = 'Average Wind Speed', title = 'Average Wind Speed of Storms by Year') + theme_minimal() # Second, a bar graph with a category on the x-axis # Use reorder() to sort by which status has the most wind ggplot(storms, aes(x = reorder(status,-wind), y = wind)) + stat_summary(geom = 'bar', # Do we want a line graph? Point? fun = mean) + # Decoration: scale_x_discrete(labels = c('Hurricane','Tropical Storm','Tropical Depression')) + # make the labels more presentable # Decoration: labs(x = NULL, y = 'Average Wind Speed', title = 'Average Wind Speed by Storm Type') + theme_minimal() . This code produces: . ", "url": "/Presentation/Figures/summary_graphs.html#r", "relUrl": "/Presentation/Figures/summary_graphs.html#r" - },"716": { + },"717": { "doc": "Graphing a By-Group or Over-Time Summary Statistic", "title": "Stata", "content": "In Stata there is not a single graph command that will graph a summary statistic line graph for us (although there is for bar graphs). Instead, for line graphs, we must collapse the data set and graph the result. You could avoid collapsing by instead using bysort group: egen newvar = mean(oldvar) (or some egen function from help egen other than mean) to create by-group statistics in the original data, use egen tag = tag(group) to select only one observation per group, and then do the below graphing commands while adding if tag == 1 to them. ** Read in the data import delimited \"https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/storms.csv\", clear * Keep the original data to return to after collapsing preserve * First, a line graph with time on the x-axis and average wind on y collapse (mean) wind, by(year) * Then, a line graph tw line wind year, xtitle(\"Year\") ytitle(\"Average Wind Speed\") restore * Now, a bar graph with a category on the x-axis graph bar (mean) wind, over(status, relabel(1 \"Hurricane\" 2 \"Tropical Depression\" 3 \"Tropical Storm\") /// Relabel the statuses to capitalize sort((mean) wind)) /// Put in height order automatically ytitle(\"Average Wind Speed\") . This code produces: . 
", "url": "/Presentation/Figures/summary_graphs.html#stata", "relUrl": "/Presentation/Figures/summary_graphs.html#stata" - },"717": { + },"718": { "doc": "Support Vector Machine", "title": "Support Vector Machine", "content": "A support vector machine (hereinafter, SVM) is a supervised machine learning algorithm in that it is trained by a set of data and then classifies any new input data depending on what it learned during the training phase. SVM can be used both for classification and regression problems but here we focus on its use for classification. The idea is to separate two distinct groups by maximizing the distance between those points that are most hard to classify. To put it more formally, it maximizes the distance or margin between support vectors around the separating hyperplane. Support vectors here imply the data points that lie closest to the hyperplane. Hyperplanes are decision boundaries that are represented by a line (in two dimensional space) or a plane (in three dimensional space) that separate the two groups. Suppose a hypothetical problem of classifying apples from lemons. Support vectors in this case are apples that look closest to lemons and lemons that look closest to apples. They are the most difficult ones to classify. SVM draws a separating line or hyperplane that maximizes the distance or margin between support vectors, in this case the apples that look closest to the lemons and lemons that look closest to apples. Therefore support vectors are critical in determining the position as well as the slope of the hyperplane. For additional information about the support vector regression or support vector machine, refer to Wikipedia: Support-vector machine. ", "url": "/Machine_Learning/support_vector_machine.html", "relUrl": "/Machine_Learning/support_vector_machine.html" - },"718": { + },"719": { "doc": "Support Vector Machine", "title": "Keep in Mind", "content": ". | Note that optimization problem to solve for a linear separator is maximizing the margin which could be calculated as \\(\\frac{2}{\\lVert w \\rVert}\\). This could then be rewritten as minimizing \\(\\lVert w \\rVert\\), or minimizing a monotonic transformation version of it expressed as \\(\\frac{1}{2}\\lVert w \\rVert^2\\). Additional constraint of \\(y_i(w^T x_i + b) \\geq 1\\) needs to be imposed to ensure that the data points are still correctly classified. As such, the constrained optimization problem for SVM looks as the following: | . \\[\\text{min} \\frac{\\lVert w \\rVert ^2}{2}\\] s.t. \\(y_i(w^T x_i + b) \\geq 1\\), . where \\(w\\) is a weight vector, \\(x_i\\) is each data point, \\(b\\) is bias, and \\(y_i\\) is each data point’s corresponding label that takes the value of either \\(\\{-1, 1\\}\\). For detailed information about derivation of the optimization problem, refer to MIT presentation slides, The Math Behind Support Vector Machines, and Demystifying Maths of SVM - Part1. | If data points are not linearly separable, non-linear SVM introduces higher dimensional space that projects data points from original finite-dimensional space to gain linearly separation. Such process of mapping data points into a higher dimensional space is known as the Kernel Trick. There are numerous types of Kernels that can be used to create higher dimensional space including linear, polynomial, Sigmoid, and Radial Basis Function. | Setting the right form of Kernel is important as it determines the structure of the separator or hyperplane. | . 
", "url": "/Machine_Learning/support_vector_machine.html#keep-in-mind", "relUrl": "/Machine_Learning/support_vector_machine.html#keep-in-mind" - },"719": { + },"720": { "doc": "Support Vector Machine", "title": "Also Consider", "content": ". | See the alternative classification method described on the K-Nearest Neighbor Matching. | . ", "url": "/Machine_Learning/support_vector_machine.html#also-consider", "relUrl": "/Machine_Learning/support_vector_machine.html#also-consider" - },"720": { + },"721": { "doc": "Support Vector Machine", "title": "Implementations", "content": " ", "url": "/Machine_Learning/support_vector_machine.html#implementations", "relUrl": "/Machine_Learning/support_vector_machine.html#implementations" - },"721": { + },"722": { "doc": "Support Vector Machine", "title": "Python", "content": "In this example, we will use scikit-learn, which is a very popular Python library for machine learning. We will look at two support vector machine models: LinearSVC, which performs linear support vector classification (example 1); and SVC, which can accept several different kernels (including non-linear ones). For the latter case, we’ll use the non-linear radial basis function kernel (example 2 below). The last part of the code example plots the decision boundary, ie the support vectors, for the second example. from sklearn.datasets import make_classification, make_gaussian_quantiles from sklearn.svm import LinearSVC, SVC from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt import numpy as np ########################### # Example 1: Linear SVM ### ########################### # Generate linearly separable data: X, y = make_classification(n_features=2, n_redundant=0, n_informative=1, n_clusters_per_class=1) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2) # Train linear SVM model svm = LinearSVC(tol=1e-5) svm.fit(X_train, y_train) # Test model test_score = svm.score(X_test, y_test) print(f'The test score is {test_score}') ############################### # Example 2: Non-linear SVM ### ############################### # Generate non-linearly separable data X, y = make_gaussian_quantiles(n_features=2, n_classes=2) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2) # Train non-linear SVM model nl_svm = SVC(kernel='rbf', C=50) nl_svm.fit(X_train, y_train) # Test model test_score = nl_svm.score(X_test, y_test) print(f'The non-linear test score is {test_score}') #################################### # Plot non-linear SVM boundaries ### #################################### plt.figure() decision_function = nl_svm.decision_function(X) support_vector_indices = np.where( np.abs(decision_function) <= 1 + 1e-15)[0] support_vectors = X[support_vector_indices] plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired) ax = plt.gca() xlim = ax.get_xlim() ylim = ax.get_ylim() xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50), np.linspace(ylim[0], ylim[1], 50)) Z = nl_svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--']) plt.scatter(support_vectors[:, 0], support_vectors[:, 1], s=100, linewidth=1, facecolors='none', edgecolors='k') plt.tight_layout() plt.show() . 
", "url": "/Machine_Learning/support_vector_machine.html#python", "relUrl": "/Machine_Learning/support_vector_machine.html#python" - },"722": { + },"723": { "doc": "Support Vector Machine", "title": "R", "content": "There are a couple of ways to implement SVM in R. Here we’ll demonstrate using the e1071 package. To learn more about the package, check out its CRAN page, as well as this vignette. Note that we’ll also load the tidyverse to help with some data wrangling and plotting. Two examples are shown below that use linear SVM and non-linear SVM respectively. The first example shows how to implement linear SVM. We start by constructing data, separating them into training and test set. Using the training set, we fit the data using the svm() function. Notice that kernel argument for svm() function is specified as linear for our first example. Next, we predict the test data based on the model estimates using the predict() function. The first example result suggests that only one out of 59 data points is incorrectly classified. The second example shows how to implement non-linear SVM. The data in example two is generated in a way to have data points of one class centered around the middle whereas data points of the other class spread on two sides. Notice that kernel argument for the svm() function is specified as radial for our second example, based on the shape of the data. The second example result suggests that only two out of 58 data points are incorrectly classified. # Install and load the packages if (!require(\"tidyverse\")) install.packages(\"tidyverse\") if (!require(\"e1071\")) install.packages(\"e1071\") library(tidyverse) # package for data manipulation library(e1071) # package for SVM ########################### # Example 1: Linear SVM ### ########################### # Construct a completely separable data set ## Set seed for replication set.seed(0715) ## Make variable x x = matrix(rnorm(200, mean = 0, sd = 1), nrow = 100, ncol = 2) ## Make variable y that labels x by either -1 or 1 y = rep(c(-1, 1), c(50, 50)) ## Make x to have unilaterally higher value when y equals 1 x[y == 1,] = x[y == 1,] + 3.5 ## Construct data set d1 = data.frame(x1 = x[,1], x2 = x[,2], y = as.factor(y)) ## Split it into training and test data flag = sample(c(0,1), size = nrow(d1), prob=c(0.5,0.5), replace = TRUE) d1 = setNames(split(d1, flag), c(\"train\", \"test\")) # Plot ggplot(data = d1$train, aes(x = x1, y = x2, color = y, shape = y)) + geom_point(size = 2) + scale_color_manual(values = c(\"darkred\", \"steelblue\")) # SVM classification svmfit1 = svm(y ~ ., data = d1$train, kernel = \"linear\", cost = 10, scale = FALSE) print(svmfit1) plot(svmfit1, d1$train) # Predictability pred.d1 = predict(svmfit1, newdata = d1$test) table(pred.d1, d1$test$y) ############################### # Example 2: Non Linear SVM ### ############################### # Construct less separable data set ## Make variable x x = matrix(rnorm(200, mean = 0, sd = 1), nrow = 100, ncol = 2) ## Make variable y that labels x by either -1 or 1 y <- rep(c(-1, 1) , c(50, 50)) ## Make x to have extreme values when y equals 1 x[y == 1, ][1:25,] = x[y==1,][1:25,] + 3.5 x[y == 1, ][26:50,] = x[y==1,][26:50,] - 3.5 ## Construct data set d2 = data.frame(x1 = x[,1], x2 = x[,2], y = as.factor(y)) ## Split it into training and test data d2 = setNames(split(d2, flag), c(\"train\", \"test\")) # Plot data ggplot(data = d2$train, aes(x = x1, y = x2, color = y, shape = y)) + geom_point(size = 2) + scale_color_manual(values = c(\"darkred\", 
\"steelblue\")) # SVM classification svmfit2 = svm(y ~ ., data = d2$train, kernel = \"radial\", cost = 10, scale = FALSE) print(svmfit2) plot(svmfit2, d2$train) # Predictability pred.d2 = predict(svmfit2, newdata = d2$test) table(pred.d2, d2$test$y) . ", "url": "/Machine_Learning/support_vector_machine.html#r", "relUrl": "/Machine_Learning/support_vector_machine.html#r" - },"723": { + },"724": { "doc": "Support Vector Machine", "title": "Stata", "content": "The below code shows how to implement support vector machines in Stata using the svmachines command. To learn more about this community contriuted command, you can read this Stata Journal article. clear all set more off *Install svmachines ssc install svmachines *Import Data with a binary outcome for classification use http://www.stata-press.com/data/r16/fvex.dta, clear *First try logistic regression to benchmark the prediction quality of SVM against logit outcome group sex arm age distance y // Run the regression predict outcome_predicted // Generate predictions from the regression *Calculate the log loss - see https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html for more info gen log_loss = outcome*log(outcome_predicted)+(1-outcome)*log(1-outcome_predicted) *Run SVM svmachines outcome group sex arm age distance y, prob // Specifiying the prob option to generate predicted probabilities in the next line predict sv_outcome_predicted, probability . Next we will Calculate the log loss (or cross-entropy loss) for SVM. Note: Predictions following svmachines generate three variables from the stub you provide in the predict command (in this case sv_outcome_predicted). The first is just the same as the stub and stores the best-guess classification (the group with the highest probability out of the possible options). The next n variables store the probability that the given observation will fall into each of the possible classes (in the binary case, this is just n=2 possible classes). These new variables are the stub + the value of each class. In the case below, the suffixes are _0 and _1. We use sv_outcome_predicted_1 because it produces probabilities that are equivalent in their intepretation (probability of having a class of 1) to the probabilities produced by the logit model and that can be used in calculating the log loss. Calculating loss functions for multi-class classifiers is more complicated, and you can read more about that at the link above. gen log_loss_svm = outcome*log(sv_outcome_predicted_1)+(1-outcome)*log(1-sv_outcome_predicted_1) *Show log loss for both logit and SVM, remember lower is better sum log_loss log_loss_svm . ", "url": "/Machine_Learning/support_vector_machine.html#stata", "relUrl": "/Machine_Learning/support_vector_machine.html#stata" - },"724": { + },"725": { "doc": "Synthetic Control", "title": "Synthetic Control Method (SCM)", "content": "Synthetic Control Method is a way of estimating the causal effect of an intervention in comparative case studies. It is typically used with a small number of large units (e.g. countries, states, counties) to estimate the effects of aggregate interventions. The idea is to construct a convex combination of similar untreated units (often referred to as the “donor pool”) to create a synthetic control that closely resembles the treatment subject and conduct counterfactual analysis with it. We have \\(j = 1, 2, ..., J+1\\) units, assuming without loss of generality that the first unit is the treated unit, \\(Y_{1t}\\). 
Denoting the potential outcome without intervention as \\(Y_{1t}^N\\), our goal is to estimate the treatment effect: . \\[\\tau_{1t} = Y_{1t} - Y_{1t}^N\\] We won’t have data for \\(Y_{1t}^N\\) but we can use synthetic controls to estimate it. Let the \\(k\\) x \\(J\\) matrix \\(X_0 = [X_2 ... X_{J+1}]\\) represent characteristics for the untreated units and the \\(k\\)-length vector \\(X_1\\) represent characteristics for the treatment unit. Last, define our \\(J\\times 1\\) vector of weights as \\(W = (w_2, ..., w_{J+1})'\\). Recall, these weights are used to form a convex combination of the untreated units. Now we have our estimate for the treatment effect: . \\[\\hat{\\tau_{1t}} = Y_{1t} - \\hat{Y_{1t}^N}\\] where \\(\\hat{Y_{1t}^N} = \\sum_{j=2}^{J+1} w_j Y_{jt}\\). The matrix of weights is found by choosing \\(W*\\) to minimize \\(\\|X_1 - X_0W\\|\\) such that \\(W >> 0\\) and \\(\\sum_2^{J+2} w_j = 1\\). Once you’ve found the \\(W*\\), you can put together an estimated \\(\\hat{Y_{1t}}\\) (synthetic control) for all time periods \\(t\\). Because our synthetic control was constructed from untreated units, when the intervention occurs at time \\(T_0\\), the difference between the synthetic control and the treated unit gives us our estimated treatment effect. As a last bit of intuition, below is a graph depicting the upshot of the method. The synthetic control follows a very similar path to the treated unit pre-intervention. The difference between the two curves, post-intervention, gives us our estimated treatment effect. Here is an excellent resource by Alberto Abadie (the economist who developed the method) if you’re interested in getting a more comprehensive overview of synthetic controls. ", "url": "/Model_Estimation/Research_Design/synthetic_control_method.html#synthetic-control-method-scm", "relUrl": "/Model_Estimation/Research_Design/synthetic_control_method.html#synthetic-control-method-scm" - },"725": { + },"726": { "doc": "Synthetic Control", "title": "Keep in Mind", "content": ". | Unlike the difference-in-difference method, parallel trends aren’t a necessary assumption. However, the donor pool must still share similar characteristics to the treatment unit in order to construct an accurate estimate. | Panel data is necessary for the synthetic control method and, typically, requires observations over many time periods. Specifically, the pre-intervention time frame ought to be large enough to form an accurate estimate. | Aggregate data is required for this method. Examples include state-level per-capita GDP, country-level crime rates, and state-level alcohol consumption statistics. Additionally, if aggregate data doesn’t exist, you can sometimes aggregate micro-level data to estimate aggregate values. | As a caveat to the previous bullet point, be wary of structural breaks when using large pre-intervention periods. | Abadie and L’Hour (2020) also proposes a penalization method for performing the synthetic control method on disaggregated data. | . ", "url": "/Model_Estimation/Research_Design/synthetic_control_method.html#keep-in-mind", "relUrl": "/Model_Estimation/Research_Design/synthetic_control_method.html#keep-in-mind" - },"726": { + },"727": { "doc": "Synthetic Control", "title": "Also Consider", "content": ". | As stated before, this technique can be compared to difference-in-difference. 
If you don’t have aggregate data or don’t have sufficient data for the pre-intervention window and you have a control that you can confidently assume has a parallel trend to the treatment unit, then diff-in-diff might be better suited than SCM. | . ", "url": "/Model_Estimation/Research_Design/synthetic_control_method.html#also-consider", "relUrl": "/Model_Estimation/Research_Design/synthetic_control_method.html#also-consider" - },"727": { + },"728": { "doc": "Synthetic Control", "title": "Implementations", "content": " ", "url": "/Model_Estimation/Research_Design/synthetic_control_method.html#implementations", "relUrl": "/Model_Estimation/Research_Design/synthetic_control_method.html#implementations" - },"728": { + },"729": { "doc": "Synthetic Control", "title": "R", "content": "To implement the synthetic control method in R, we will be using the package Synth. While not used here, the SynthTools package also has a number of functions for making it easier to work with the Synth package. As stated above, the key part of the synthetic control method is to estimate the weight matrix \\(W*\\) in order to form accurate estimates of the treatment unit. The Synth package provides you with the tools to find the weight matrix. From there, you can construct the synthetic control by interacting the \\(W*\\) and the \\(Y\\) values from the donor pool. # First we will load Synth and dplyr. # If you haven't already installed Synth, now would be the time to do so library(dplyr) library(Synth) # We're going to use simulated data included in the Synth package for our example. # This dataframe consists of panel data including 1 outcome variable and 3 predictor variables for 1 treatment unit and 7 control units (donor pool) over 21 years data(\"synth.data\") # The primary function that we will use is the synth() function. # However, this function needs four particularly formed matrices as inputs, so it is highly recommended that you use the dataprep() function to generate the inputs. # Once we've gathered our dataprep() output, we can just use that as our sole input for synth() and we'll be good to go. # One important note is that your data must be in long format with id variables (integers) and name variables (character) for each unit. dataprep_out = dataprep( foo = synth.data, # first input is our data predictors = c(\"X1\", \"X2\", \"X3\"), # identify our predictor variables predictors.op = \"mean\", # operation to be performed on the predictor variables for when we form our X_1 and X_0 matrices. time.predictors.prior = c(1984:1989), # pre-intervention window dependent = \"Y\", # outcome variable unit.variable = \"unit.num\", # identify our id variable unit.names.variable = \"name\", # identify our name variable time.variable = \"year\", # identify our time period variable treatment.identifier = 7, # integer that indicates the id variable value for our treatment unit controls.identifier = c(2, 13, 17, 29, 32, 36, 38), # vector that indicates the id variable values for the donor pool time.optimize.ssr = c(1984:1990), # identify the time period you want to optimize over to find the W*. Includes pre-treatment period and the treatment year. time.plot = c(1984:1996) # periods over which results are to be plotted with Synth's plot functions ) # Now we have our data ready in the form of a list. 
We have all the matrices we need to run synth() # Our output from the synth() function will be a list that includes our optimal weight matrix W* synth_out = dataprep_out %>% synth() # From here, we can plot the treatment variable and the synthetic control using Synth's plot function. # The variable tr.intake is an optional variable if you want a dashed vertical line where the intervention takes place. synth_out %>% path.plot(dataprep.res = dataprep_out, tr.intake = 1990) # Finally, we can construct our synthetic control variable if we wanted to conduct difference-in-difference analysis on it to estimate the treatment effect. synth_control = dataprep_out$Y0plot %*% synth_out$solution.w . ", "url": "/Model_Estimation/Research_Design/synthetic_control_method.html#r", "relUrl": "/Model_Estimation/Research_Design/synthetic_control_method.html#r" - },"729": { + },"730": { "doc": "Synthetic Control", "title": "Stata", "content": "To implement the synthetic control method in Stata, we will be using the synth and synth_runner packages. For a short tutorial on how to carry out the synthetic control method in Stata by Jens Hainmueller, there is a useful video here. *Install plottig graph scheme used below ssc install blindschemes *Install synth and synth_runner if they're not already installed (uncomment these to install) * ssc install synth, all * cap ado uninstall synth_runner //in-case already installed * net install synth_runner, from(https://raw.github.com/bquistorff/synth_runner/master/) replace *Import Dataset sysuse synth_smoking.dta, clear *Need to set the data as time series, using tsset tsset state year . Next we will run the synthetic control analysis using synth_runner, which adds some useful options for estimation. Note that this example uses the pre-treatment outcome for just three years (1988, 1980, and 1975), but any combination of pre-treatment outcome years can be specified. The nested option specifies a more computationally intensive but comprehensive method for estimating the synthetic control. The trunit() option specifies the ID of the treated entity (in this case, the state of California has an ID of 3). synth cigsale beer lnincome retprice age15to24 cigsale(1988) /// cigsale(1980) cigsale(1975), trunit(3) trperiod(1989) fig /// nested keep(synth_results_data.dta) replace /*Keeping the synth_results_data.dta stores a dataset of all the time series values of cigsale for each year for California (observed) and synthetic California (constructed using a weighted average of observed data from donor states). We can then import this dataset to create a synth plot whose attributes we can control. */ use synth_results_data.dta, clear drop _Co_Number _W_Weight // Drops the columns of the data that store the donor state weights twoway line (_Y_treated _Y_synthetic _time), scheme(plottig) xline(1989) /// xtitle(Year) ytitle(Cigarette Sales) legend(pos(6) rows(1)) ** Run the analysis using synth_runner *Import Dataset sysuse synth_smoking.dta, clear *Need to set the data as time series, using tsset tsset state year *Estimate Synthetic Control using synth_runner synth_runner cigsale beer(1984(1)1988) lnincome(1972(1)1988) retprice age15to24 cigsale(1988) cigsale(1980) /// cigsale(1975), trunit(3) trperiod(1989) gen_vars . We can plot the effects in two ways: displaying both the treated and synthetic time series together and displaying the difference between the two over the time series. 
The first plot is equivalent to the plot produced by specifying the fig option for synth, except you can control aspects of the figure. For both plots you can control the plot appearence by specifying effect_options() or tc_options(), depending on which plot you would like to control. effect_graphs, trlinediff(-1) effect_gname(cigsale1_effect) tc_gname(cigsale1_tc) /// effect_options(scheme(plottig)) tc_options(scheme(plottig)) /*Graph the outcome paths of all units and (if there is only one treated unit) a second graph that shows prediction differences for all units */ single_treatment_graphs, trlinediff(-1) raw_gname(cigsale1_raw) /// effects_gname(cigsale1_effects) effects_ylabels(-30(10)30) /// effects_ymax(35) effects_ymin(-35) . ", "url": "/Model_Estimation/Research_Design/synthetic_control_method.html#stata", "relUrl": "/Model_Estimation/Research_Design/synthetic_control_method.html#stata" - },"730": { + },"731": { "doc": "Synthetic Control", "title": "Synthetic Control", "content": " ", "url": "/Model_Estimation/Research_Design/synthetic_control_method.html", "relUrl": "/Model_Estimation/Research_Design/synthetic_control_method.html" - },"731": { + },"732": { "doc": "Task Scheduling with Github Actions", "title": "The Problem We’ll Solve", "content": "The United States Substance Abuse and Mental Health Services Administration (SAMHSA) is an agency inside the U.S. Department of Health and Human Services tasked with overseeing the country’s substance abuse and mental health initiatives. A major one of these initiatives is maintaining the list of “waived providers” who can prescribe opioids, something that is typically prohibited under the federal Controlled Substances Act. SAMHSA makes available a list of currently waived providers, but does not publish (at least easily) historical lists of providers. As such, we’ll write a small web scraper that pulls all the data from their website and writes it out to a CSV. This article, however, is not about web scrapers. Instead, our problem is that SAMHSA seems to update the list without fanfare at irregular intervals. So we would like to scrape their website every day. This article demonstrates how set up a Github repo to do just that. ", "url": "/Other/task_scheduling_with_github_actions.html#the-problem-well-solve", "relUrl": "/Other/task_scheduling_with_github_actions.html#the-problem-well-solve" - },"732": { + },"733": { "doc": "Task Scheduling with Github Actions", "title": "Requirements", "content": "You’ll need: . | A Github account and some familiarity with git | A program that can be run on the command line that accomplishes your data gathering task | The requirements for that program enumerated in one of several standard ways | . For the rest of this section, we’ll focus a bit on requirements (2) and (3). Requirement (2): A command line program . What you’ll be able to tell Github to do is run a series of commands. It is best to package these up into one command that will do everything for you. For instance, if you’re using python, you will probably want to have a file called main.py that looks something like this: . 
import csv import sys from datetime import datetime from typing import List, Union import requests URL = \"https://whereveryourdatais.com/\" def process_page(html: str) -> List[List[Union[int, str]]]: \"\"\" This is the meat of your web scraper: Pulling out the data you want from the HTML of the web page \"\"\" def pull_data(url: str) -> List[List[Union[int, str]]]: resp = requests.get(url) resp.raise_for_status() content = resp.content.decode('utf8') return process_page(content) def main(): # The program takes 1 optional argument: an output filename. If not present, # we will write the output a default filename, which is: filename = f\"data/output-{datetime.utcnow().strftime('%Y-%m-%d').csv\" if len(sys.argv) > 1: filename = sys.argv[1] print(f\"Will write data to {filename}\") print(f\"Pulling data from {URL}...\") data = pull_data(URL) print(f\"Done pulling data.\") print(\"Writing data...\") with open(filename, 'wt') as outfile: writer = csv.writer(outfile) writer.writerows(data) print(\"Done writing data.\") if __name__ == \"__main__\": main() . Here the meat of your web scraper goes into the pull_data and the process_page functions. These are then wrapped into the main function which you can call on the command line as: . python3 main.py . Similarly, if you’re using R, you’ll want to create a main.R file to similar effect. For instance, it might look something like: . library(readr) library(httr) URL <- \"https://whereveryourdatais.com/\" #' This hte meat of your web scraper: #' Pulling out the data you want from the HTML of the web page process_page <- function(html) { # Process html } #' Pull data from a single URL and return a tibble with it nice and ordered pull_data <- function(url) { resp <- GET(url) if (resp$status_code >= 400) { stop(paste0(\"Something bad occurred in trying to pull \", URL)) } return(process_page(content(resp))) } main <- function() { # The program takes 1 optional argument: an output filename. If not present, # we will write the output a default filename, which is: date <- Sys.time() attr(date, \"tzone\") <- \"UTC\" filename <- paste0(\"data/output-\", as.Date(date, format = \"%Y-%m-%d\")) args <- commandArgs(trailingOnly = TRUE) if (length(args) > 0) { filename <- args[1] } print(paste0(\"Will write data to \", filename)) print(paste0(\"Pulling data from \", URL)) data <- pull_data(URL) print(\"Done pulling data\") print(\"Writing data...\") write_csv(data, filename) print(\"Done writing data.\") } . Here the meat of your web scraper goes into the pull_data and the process_page functions. These are then wrapped into the main function which you can call on the command line as (note the --vanilla): . Rscript --vanilla main.R . Requirement (3): Enumerated lists of requirements . In order for Github to run your command, it will need to know what dependencies it needs to install. For experts, using a tool like poetry in Python or renv in R is probably what you actually want to do. However, for the purposes of this article, we’ll stick to a simple list. As such, you should create a file entitled requirements.txt in your project’s main folder. In this you should list, one requirement per line, the requirements of your script. For instance, in the python example above, your requirements.txt should look like . requests . The R example should have . httr readr . If you’re using R, you’ll also need to add the following script in a file called install.R to your project: . 
CRAN <- \"https://mirror.las.iastate.edu/CRAN/\" process_file <- function(filepath) { con <- file(filepath, \"r\") while (TRUE) { line <- trimws(readLines(con, n = 1)) if (length(line) == 0) { break } install.packages(line, repos = CRAN) } close(con) } process_file(\"requirements.txt\") . ", "url": "/Other/task_scheduling_with_github_actions.html#requirements", "relUrl": "/Other/task_scheduling_with_github_actions.html#requirements" - },"733": { + },"734": { "doc": "Task Scheduling with Github Actions", "title": "Setting up the Action", "content": "With all of the above accomplished, you should have a main.py or a main.R file and a requirements.txt file setup in your repository. If you’re using R, you’ll also have an install.R script present. With that, we move to setting up the Github Action! . In this section, we assume that your repository is already on Github. Throughout, we’ll assume that the repository is hosted at USERNAME/REPO, e.g., lost-stats/lost-stats.github.io. Telling it to run . Now you just need to add a file called .github/workflows/schedule.yml to your repo. Its contents should look like this: . name: Run scheduled action on: schedule: # You need to set your schedule here - cron: CRON_SCHEDULE jobs: pull_data: runs-on: ubuntu-20.04 steps: - name: Checkout code uses: actions/checkout@v2 with: persist-credentials: false fetch-depth: 0 # If using Python: - name: Set up Python 3.8 uses: actions/setup-python@v2 with: python-version: \"3.8\" # If using R: - name: Set up R 4.0.3 uses: r-lib/actions/setup-r@v1 with: r-version: \"4.0.3\" # If using Python: - name: Install dependencies run: pip install -r requirements.txt # If using R: - name: Install dependencies run: Rscript --vanilla install.R # If using Python: - name: Pull data run: python3 main.py # If using R: - name: Pull data run: Rscript --vanilla main.R # NOTE: This commits everything in the `data` directory. Make sure this matches your needs - name: Git commit run: | git add data git config --local user.email \"action@github.com\" git config --local user.name \"GitHub Action\" git commit -m \"Commiting data\" # NOTE: Check that your branch name is correct here - name: Git push run: | git push \"https://${GITHUB_ACTOR}:${TOKEN}@github.com/${GITHUB_REPOSITORY}.git\" HEAD:main env: TOKEN: ${{ secrets.GITHUB_TOKEN }} . You’ll need to edit this file and retain only the stanzas that pertain to whether you’re using Python or R. However, you’ll need to make a few adjustments. Let’s go through the file stanza by stanza to explain what it is doing: . name: Run scheduled action . This is just a descriptive name. Everything after the : is decorative. Name it whatever you like! . on: . This section describes when the action should run. Github actions supports several potential events, including push, pull_request, and repository_dispatch. However, since this is a scheduled action, we’re going to use the schedule event. The next line - cron: CRON_SCHEDULE tells Github how frequently to run the action. You need to replace CRON_SCHEDULE with your preferred frequency. You need to write this in “cron syntax,” which is an arcane but pretty universally recognized format for specifying event schedules. I recommend using a helper like this one to write this expression. For instance, let’s say we want to run this job at noon UTC every day. Then this line should become - cron: \"0 12 * * *\". jobs: . This tells us that we’re about to begin specifying the list of jobs to be run on the schedule described above. pull_data: . 
This is also just a descriptive name. It is best that it follow snake_casing, in particular, it should have no spaces or strange characters. runs-on: ubuntu-20.04 . This specifies which operating system to run your code on. Github supports a lot of choices, but generally, ubuntu-20.04 or ubuntu-latest is what you’ll want. steps: . In what follows, we list out the individual steps Github should take. Each step consists of several components: . | name: A descriptive name. Can be anything you’d like. It’s also optional, but I find it useful. | uses: Optionally reference an series of steps somebody else has already specified. | with: If using uses:, specificy any variables in calling that action. | run: Just simply run a (series of) commands in the shell, one per line. | env: Specify envrionment variables for use in the shell. | . We’ll see several examples of this below. Checkout code . This stanza tells the action to checkout this repository’s code. This will begin basically every Github action you build. Note that it uses: a standard action that is maintained by Github itself. Setup Python or R . These are actions that tell Github to make a specific version of Python or R available in your envrionment. You probably only need one, but you can use both if you need. Specify the exact version you want in the with: section. Install dependencies . This runs a script that installs all the dependencies you enumerated earlier in requirements.txt. Python comes with a built in dependency manager called pip, so we just point it to our list of dependencies. On the other hand, we tell R to execute our dependency installation script install.R. In either case, we’re using run: as we’re telling Github to execute a command in its own shell. Pull data . This is the task we’re actually going to run! Note that we’re calling either the main.py or main.R file we built before. After this is done, we assume there will be a new file in the data/ directory. Git commit . This stanza commits the new data to this repository and sets up the required git variables. Note that here we’re using run: |. In YAML, ending a line with | indicates that all the following lines that are at the same tab depth should be used as a single value. So here, we’re telling Github to run the commands, git add data, git config --local user.email \"action@github.com\", etc in order. Git push . This pushes the commit back up to the repository using git push. Note that if the name of your main branch is not main (for instance, it may be master), you will need to change HEAD:main to whatever your main branch is called (e.g., HEAD:master). Also note that we are setting an environment variable here. Specfically, in the env: section we’re setting the TOKEN environment variable to ${{ secrets.GITHUB_TOKEN }}. This is a a special value that Github generates for each run of your action that allows your action to manipulate its own repository. In this case, it’s allowing it to push a commit back to the central repository. ", "url": "/Other/task_scheduling_with_github_actions.html#setting-up-the-action", "relUrl": "/Other/task_scheduling_with_github_actions.html#setting-up-the-action" - },"734": { + },"735": { "doc": "Task Scheduling with Github Actions", "title": "And that’s all!", "content": "And that’s it! With that file commited, you Github action should run every day at noon UTC. From here, there are a lot of simple extensions to be made and tried. Here are some challenges to make sure you know what’s going on above: . 
| Instead of making the job run every day at noon UTC, make it run on Wednesdays at 4pm UTC. | Instead of returning a tibble, return a data.frame in R. Note that you’ll need to expand the collection of requirements! | Instead of returning a list of lists in Python, return a pandas data frame. Note that you’ll need to expand the collection of requirements! | . ", "url": "/Other/task_scheduling_with_github_actions.html#and-thats-all", "relUrl": "/Other/task_scheduling_with_github_actions.html#and-thats-all" - },"735": { + },"736": { "doc": "Task Scheduling with Github Actions", "title": "One final note: API keys", "content": "A very common need when pulling data is some sort of API key. Your cron job will need access to your API key. Conveniently, Github has provided nice functionality to do exactly this: Secrets. To get your API key to your script, follow these steps: . | Set up your secret according to the above instructions. Let’s give it the name API_KEY for convenience. | Modify your main.py or main.R file to look for the API_KEY environment variable. For instance, in Python you might do: | . import os api_key = os.environ.get(\"API_KEY\", \"some_other_way\") . or in R you might do . api_key <- Sys.getenv(\"API_KEY\", unset = \"some_other_way\") . | Amend the Pull data step in your action to set the API_KEY environment variable. For instance, it might look like: | . - name: Pull data run: python3 main.py env: API_KEY: ${{ secrets.API_KEY }} . ", "url": "/Other/task_scheduling_with_github_actions.html#one-final-note-api-keys", "relUrl": "/Other/task_scheduling_with_github_actions.html#one-final-note-api-keys" - },"736": { + },"737": { "doc": "Task Scheduling with Github Actions", "title": "Task Scheduling with Github Actions", "content": "Typically when performing statistical analyses, we write code to be run approximately once. But software more generally is frequently run multiple times. Web servers run constantly, executing the same code over and over in response to user commands. A video game is rerun on demand, each time you turn it on. In statistical analyses, though, if code is to be run multiple times, it often needs to be run on a schedule. For instance, you may want to scrape weather data every hour to build an archive for later analysis. Or perhaps you want to perform the same statistical analyses each week on new data as it comes in. In our experience, this is the worst kind of task for humans to do: They have to reliably remember to run a piece of code at a specified time, aggregate the results in a consistent format, and then walk away. One mistimed meeting or baby feeding and it’s likely the researcher will forget to hit “go.” . Thankfully, in addition to doing things over and over or on demand, computers are also reasonably good at keeping time. In this article, we’ll describe the role of a task scheduler and demonstrate how to use Github Actions to run a simple data gathering task at regular intervals and commit that data to a repository. ", "url": "/Other/task_scheduling_with_github_actions.html", "relUrl": "/Other/task_scheduling_with_github_actions.html" - },"737": { + },"738": { "doc": "Tobit Regression", "title": "Tobit Regression", "content": "If you have ever encountered data that is censored in some way, then the Tobit method is worth a detailed look. Perhaps the measurement tools only detect above some minimum threshold or up to some maximum threshold, or there’s a physical limitation or natural constraint that cuts off the range of outcomes. 
If the dependent variable has a limited range in any way, then an OLS regression will estimate the relationship between variables with a cluster of zeros or maximums distorting that relationship. James Tobin’s big idea was essentially to modify the likelihood function to represent the unequal sampling probability of observations depending on whether a latent dependent variable falls below or above that range. The Tobit model is also called a Censored Regression Model for this reason, as it allows the flexibility to account for either left- or right-side censoring. There is flexibility in the mathematics depending on how the censoring occurs. To match the mathematics/functional form to your practical application, Wikipedia has a great page here, along with links to outside practical applications. For a Spanish version, click the following link here. Those notes contain the important lessons from this English-language page. ", "url": "/Model_Estimation/GLS/tobit.html", "relUrl": "/Model_Estimation/GLS/tobit.html" - },"738": { + },"739": { "doc": "Tobit Regression", "title": "Keep in Mind", "content": ". | Tobit is used with Censored Data, which IS NOT the same as Truncated Data (see next section) | Tobit can produce a kinked relationship after a zero cluster | Tobit can find the correct relationship underneath a maximum cluster | For non-parametric tobit, you will need a CLAD operator (see link in next section) | . ", "url": "/Model_Estimation/GLS/tobit.html#keep-in-mind", "relUrl": "/Model_Estimation/GLS/tobit.html#keep-in-mind" - },"739": { + },"740": { "doc": "Tobit Regression", "title": "Also Consider", "content": ". | If you are new to the concept of limited dependent variables or OLS Regression, click these links. | Deciphering whether data is censored or truncated is important. If all observations are observed in “X” but the true value of “Y” isn’t known outside some range, then it is Censored. At the Chernobyl disaster, the radioactive isotope meter only read up to a maximum threshold: all areas (“X”) are observed, but the true value of the radioactive level (“Y”) is right-censored at that maximum. When there is not a full set of “X” observed, the data is truncated; in other words, a censored Y value does not get its input X observed, so the set {Y,X} is not complete. For more info, try these UH slides from the Bauer School of Business (they also have relatively digestible theory). | The Tobit model type I (the one people usually mean when they don’t specify a type) is really a modified maximum likelihood estimation of a probit; see those links for more background. | If you find yourself needing a non-parametric form, you will need to use a CLAD operator as well as new variance estimation techniques; I recommend Bruce Hansen’s notes from the University of Wisconsin, here. | . ", "url": "/Model_Estimation/GLS/tobit.html#also-consider", "relUrl": "/Model_Estimation/GLS/tobit.html#also-consider" - },"740": { + },"741": { "doc": "Tobit Regression", "title": "Implementations", "content": " ", "url": "/Model_Estimation/GLS/tobit.html#implementations", "relUrl": "/Model_Estimation/GLS/tobit.html#implementations" - },"741": { + },"742": { "doc": "Tobit Regression", "title": "R", "content": "We can use the AER package (link) to run a tobit model in R. 
# install.packages(\"AER\") # Install first if you don't have it yet library(AER) data(\"Affairs\") # Use the \"Affairs\" dataset provided with AER # Aside: this example replicates Table 22.4 in Greene (2003) tob_mod1 = tobit(affairs ~ age + yearsmarried + religiousness + occupation + rating, data = Affairs) summary(tob_mod1) # The default left- and right-hand side limts for the censored dependent variable # are 0 and Inf, respectively. You might want to change these after inspecting your # data. hist(Affairs$affairs tob_mod2 = tobit(affairs ~ age + yearsmarried + religiousness + occupation + rating, data = Affairs, right = 4) # RHS censored now at 4 summary(tob_mod2) . For another example check out M Clark’s Models by Example Page. ", "url": "/Model_Estimation/GLS/tobit.html#r", "relUrl": "/Model_Estimation/GLS/tobit.html#r" - },"742": { + },"743": { "doc": "2x2 Difference in Difference", "title": "2X2 Difference-in-Differences", "content": "Causal inference with cross-sectional data is fundamentally tricky. | People, firms, etc. are different from one another in lots of ways. | Can only get a clean comparison when you have a (quasi-)experimental setup, such as an experiment or an regression discontinuity. | . Difference-in-difference makes use of a treatment that was applied to one group at a given time but not another group. It compares how each of those groups changed over time (comparing them to themselves to eliminate between-group differences) and then compares the treatment group difference to the control group difference (both of which contain the same time gaps, eliminating differences over time). ", "url": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#2x2-difference-in-differences", "relUrl": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#2x2-difference-in-differences" - },"743": { + },"744": { "doc": "2x2 Difference in Difference", "title": "KEEP IN MIND", "content": ". | For Difference-in-differences to work, parallel trends must hold. That is, nothing else should be changing the gap between treated and control states at the same time as the treatment. While it is not a formal test of parallel trends, researchers often look at whether the gap between treated and control states is constant in pre-treatment years. | Suppose in \\(t = 0\\) (“Pre-period”), and \\(t = 1\\) (“Post-period”). We want to estimate \\(\\tau = Post - Pre\\), or \\(Y(post)-Y(pre)= Y(t=1)-Y(t=0)=\\tau\\). | . ", "url": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#keep-in-mind", "relUrl": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#keep-in-mind" - },"744": { + },"745": { "doc": "2x2 Difference in Difference", "title": "ALSO CONSIDER", "content": ". | This page discusses “2x2” difference-in-difference design, meaning there are two groups, and treatment occurs at a single point in time. Many difference-in-difference applications instead use many groups, and treatments that are implemented at different times (a “rollout” design). Traditionally these models have been estimated using fixed effects for group and time period, i.e. “two-way” fixed effects. However, this approach with difference-in-difference can heavily bias results if treatment effects differ across groups, and alternate estimators are preferred. See Goodman-Bacon 2018 and Callaway and Sant’anna 2019. | . 
", "url": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#also-consider", "relUrl": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#also-consider" - },"745": { + },"746": { "doc": "2x2 Difference in Difference", "title": "IMPLEMENTATIONS", "content": " ", "url": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#implementations", "relUrl": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#implementations" - },"746": { + },"747": { "doc": "2x2 Difference in Difference", "title": "Python", "content": "# Step 1: Load libraries and import data import pandas as pd import statsmodels.api as sm # for certain versions of jupyter: # %matplotlib inline url = ( \"https://raw.githubusercontent.com/LOST-STATS/LOST-STATS\" \".github.io/master/Model_Estimation/Data/\" \"Two_by_Two_Difference_in_Difference/did_crime.xlsx\" ) df = pd.read_excel(url) # Step 2: indicator variables # whether treatment has occured at all df['after'] = df['year'] >= 2014 # whether it has occurred to this entity df['treatafter'] = df['after'] * df['treat'] # Step 3: # use pandas basic built in plot functionality to get a visual # perspective of our parallel trends assumption ax = df.pivot(index='year', columns='treat', values='murder').plot( figsize=(20, 10), marker='.', markersize=20, title='Murder and Time', xlabel='Year', ylabel='Murder Rate', # to make sure each year is displayed on axis xticks=df['year'].drop_duplicates().sort_values().astype('int') ) # the function returns a matplotlib.pyplot.Axes object # we can use this axis to add additional decoration to our plot ax.axvline(x=2014, color='gray', linestyle='--') # treatment year ax.legend(loc='upper left', title='treat', prop={'size': 20}) # move and label legend # Step 4: # statsmodels has two separate APIs # the original API is more complete both in terms of functionality and documentation X = sm.add_constant(df[['treat', 'treatafter', 'after']].astype('float')) y = df['murder'] sm_fit = sm.OLS(y, X).fit() # the formula API is more familiar for R users # it can be accessed through an alternate constructor bound to each model class smff_fit = sm.OLS.from_formula('murder ~ 1 + treat + treatafter + after', data=df).fit() # it can also be accessed through a separate namespace import statsmodels.formula.api as smf smf_fit = smf.ols('murder ~ 1 + treat + treatafter + after', data=df).fit() # if using jupyter, rich output is displayed without the print function # we should see three identical outputs print(sm_fit.summary()) print(smff_fit.summary()) print(smf_fit.summary()) . ", "url": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#python", "relUrl": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#python" - },"747": { + },"748": { "doc": "2x2 Difference in Difference", "title": "R", "content": "In this case, we need to discover whether legalized marijuana could change the murder rate. Some states legalized marijuana in 2014. So we measure the how the murder rate changes from before 2014 to after between legalized states and states without legalization. Step 1: . | First of all, we need to load Data and Package, we call this data set “DiD”. | . 
library(tidyverse) library(broom) library(readxl) library(httr) # Download the Excel file from a URL tf <- tempfile(fileext = \".xlsx\") GET( \"https://raw.githubusercontent.com/LOST-STATS/LOST-STATS.github.io/master/Model_Estimation/Data/Two_by_Two_Difference_in_Difference/did_crime.xlsx\", write_disk(tf) ) DiD <- read_excel(tf) . Step 2: . Notice that the data has already been collapsed to the treated-year level. That is, there is one observation of the murder rate for each year for the treated states (all averaged together), and one observation of the murder rate for each year for the untreated states (all averaged together). We create an indicator variable called after to indicate whether the observation is in the post-treatment period of 2014 and later (1), or the pre-treatment period of 2000-2013 (0). The variable treat indicates that the state legalized marijuana in 2014. Notice that treat = 1 in these states even before 2014. If the year is 2014 or later and the state legalized marijuana, the indicator variable “treatafter” is “1” . DiD <- DiD %>% mutate(after = year >= 2014) %>% mutate(treatafter = after*treat) . Step 3: . Then we plot the data with ggplot to visualize the impact of legalizing marijuana on the murder rate. mt <- ggplot(DiD,aes(x=year, y=murder, color = treat)) + geom_point(size=3)+geom_line() + geom_vline(xintercept=2014,lty=4) + labs(title=\"Murder and Time\", x=\"Year\", y=\"Murder Rate\") mt . It looks like, before the legalization occurred, murder rates in treated and untreated states were very similar, lending plausibility to the parallel trends assumption. Step 4: . We need to measure the impact of legalizing marijuana. If we include treat, after, and treatafter in a regression, the coefficient on treatafter can be interpreted as “how much bigger was the before-after difference for the treated group?” which is the DiD estimate. reg<-lm(murder ~ treat+treatafter+after, data = DiD) summary(reg) . After legalization, the murder rate dropped by 0.3% more in treated than untreated states, suggesting that legalization reduced the murder rate. ", "url": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#r", "relUrl": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html#r" - },"748": { + },"749": { "doc": "2x2 Difference in Difference", "title": "2x2 Difference in Difference", "content": " ", "url": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html", "relUrl": "/Model_Estimation/Research_Design/two_by_two_difference_in_difference.html" - },"749": { + },"750": { "doc": "Home", "title": "Home", "content": "# Home Welcome to the **Library of Statistical Techniques** (LOST)! LOST is a publicly-editable website with the goal of making it easy to execute statistical techniques in statistical software. Each page of the website contains a statistical technique — which may be an estimation method, a data manipulation or cleaning method, a method for presenting or visualizing results, or any of the other kinds of things that statistical software typically does. For each of those techniques, the LOST page will contain code for performing that method in a variety of packages and languages. It may also contain information (or links) with thorough descriptions of the method, but the focus here is on implementation. How can you do it in your language of choice? If there are multiple ways, how are those ways different? 
Is the way you used to do it outdated, or does it do something unexpected? What's the `R` equivalent of that command you know about in `Stata` or `SAS`, or vice versa? In short, LOST is a Rosetta Stone for statistical software. If you are interested in contributing to LOST, please see the [Contributing](https://lost-stats.github.io/Contributing/Contributing.html) page. LOST was originated in 2019 by Nick Huntington-Klein and is maintained by volunteer contributors. The project's GitHub page is [here](https://github.com/LOST-STATS/lost-stats.github.io). ", diff --git a/feed.xml b/feed.xml index 686a601e..c605cde4 100644 --- a/feed.xml +++ b/feed.xml @@ -1 +1 @@ -Jekyll2024-08-19T18:43:02+00:00/feed.xmlLOST \ No newline at end of file +Jekyll2024-08-22T06:26:19+00:00/feed.xmlLOST \ No newline at end of file