diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index c865a362b..000000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,244 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "id": "27a9f924-6e0b-4ca5-8bba-a2af2dfbddd8", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1c942d13-fafc-4c2b-b31f-7b0df5cddfb8", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "hello there\n" - ] - } - ], - "source": [ - "print(\"hello there\")" - ] - }, - { - "cell_type": "markdown", - "id": "abf91581-f1ad-4368-bc1a-6e3f8ca0898e", - "metadata": {}, - "source": [ - "---CELL BEGIN---\n", - "\n", - "You can do anything at https://html5zombo.com\n", - "\n", - "---CELL END---" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "59e3e440-45c1-435d-a612-1e149bd86910", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "hello there\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "print(\"hello there\")\n", - "\n", - "x = [1, 2, 3, 4, 5]\n", - "y = [1, 4, 9, 16, 25]\n", - "plt.figure(figsize=(8, 6))\n", - "plt.plot(x, y)\n", - "plt.gcf().patch.set_alpha(1.0) # change these values to < 1.0 for transparency in different parts of the graph\n", - "plt.gca().patch.set_alpha(1.0) # change these values to < 1.0 for transparency in different parts of the graph\n", - "plt.show()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "acd43991-a8d7-4983-b51f-b873b30e968b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "img with missing alt tag in markdown\n", - "\n", - "![](https://i5.walmartimages.com/seo/Tzicr-Modern-Velvet-Home-Office-Chair-Upholstered-Chairs-with-360-Swivel-Armchair-and-Gold-Base-for-Living-Room-Bedroom-Vanity-Study-Beige_6536f77a-49dd-44b0-82e0-3baf61fdae01.7e36947377a7a46b6313b3bd412c092e.jpeg)" - ] - }, - { - "cell_type": "markdown", - "id": "c7472a21-64c2-4497-8da9-6346b639da9d", - "metadata": {}, - "source": [ - "local image in markdown with alt tag \n", - "\n", - "![sds](testLocalImage.png)" - ] - }, - { - "cell_type": "markdown", - "id": "5a6c5a8d-df6f-4e16-b719-f2c33bd421ff", - "metadata": {}, - "source": [ - "local transparent image in markdown without alt tag\n", - "\n", - "![h](testLocalImageTransp.png)" - ] - }, - { - "cell_type": "markdown", - "id": "e8a1561d-6a34-4407-8dcd-f88d65bc5649", - "metadata": {}, - "source": [ - "transparent with alt tag html\n", - "\n", - "\"d\"" - ] - }, - { - "cell_type": "markdown", - "id": "75f7b70b-1a03-4d5a-9a88-14dd4872bb4b", - "metadata": {}, - "source": [ - "imgs without alt tag html\n", - "\n", - "![h](testLocalImage.png)\n", - "\n", - "\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "89031092-2561-44b9-9b01-4d69445b2802", - "metadata": 
{}, - "source": [ - "### heading 3" - ] - }, - { - "cell_type": "markdown", - "id": "a4923e7d-c548-43e8-9af6-75d1bda27314", - "metadata": {}, - "source": [ - "#### heading 4" - ] - }, - { - "cell_type": "markdown", - "id": "1282f139-5580-435e-83fa-e0ec1cea1efd", - "metadata": {}, - "source": [ - "## heading 2\n", - "\n", - "paragraph\n", - "paragraph\n", - "paragraph" - ] - }, - { - "cell_type": "markdown", - "id": "80b58b7d-747e-4275-b5f3-92ddf6ea7e1c", - "metadata": {}, - "source": [ - "#### heading 4\n", - "\n", - "another paragraph" - ] - }, - { - "cell_type": "markdown", - "id": "665980c8-5c82-40b6-8bb0-53308980ff89", - "metadata": {}, - "source": [ - "### heading 3" - ] - }, - { - "cell_type": "markdown", - "id": "3360305c-ad8e-491c-93b4-8d4abf2afe10", - "metadata": {}, - "source": [ - "### heading 3" - ] - }, - { - "cell_type": "markdown", - "id": "a9838dde-9edf-4f19-852f-c0926018922f", - "metadata": {}, - "source": [ - "### heading 3" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/Untitled1.ipynb b/Untitled1.ipynb deleted file mode 100644 index 8c81ae6f0..000000000 --- a/Untitled1.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ceaff4ea-b71d-4046-93ec-c4fa3ae49b79", - "metadata": {}, - "source": [ - "![](testLocalImageTransp.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - 
"name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/lab11.ipynb b/lab11.ipynb deleted file mode 100644 index 2eca02087..000000000 --- a/lab11.ipynb +++ /dev/null @@ -1,1255 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 49, - "id": "1ea205e9", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "# Initialize Otter\n", - "import otter\n", - "grader = otter.Notebook(\"lab11.ipynb\")" - ] - }, - { - "cell_type": "markdown", - "id": "a3b58c9b", - "metadata": {}, - "source": [ - "# Lab 11: Climate Part 2\n", - "\n", - "Welcome to Lab 11!\n", - "\n", - "This Lab will see you complete your analysis from the previous lab by assessing the impact of drought. \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "da6f4717", - "metadata": {}, - "source": [ - "**Getting help on lab**: Whenever you feel stuck or need some further clarification, find a GSI or tutor, and they'll be happy to help!\n", - "\n", - "Here are the policies for getting full credit:\n", - "\n", - "1. For students enrolled in in-person **Regular Labs**, you will receive lab credit by **attending** lab section and making significant progress on the lab notebook. You **do not** need to submit this assignment to Gradescope.\n", - "\n", - "2. For students enrolled in **Self-Service Lab**, you will receive lab credit by **completing** the notebook, passing the test cases, and **submitting** it to Gradescope by **11:59pm on the due date**.\n", - "\n", - "**Submission**: Once you’re finished, run all cells besides the last one, select File > Save Notebook, and then execute the final cell. The result will contain a zip file that you can use to submit on Gradescope.\n", - "\n", - "Let's begin by setting up the tests and imports by running the cell below." 
- ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "a7880d74", - "metadata": {}, - "outputs": [], - "source": [ - "# Run this cell to set up the notebook, but please don't change it.\n", - "from datascience import *\n", - "import numpy as np\n", - "\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "plt.style.use('fivethirtyeight')\n", - "np.set_printoptions(legacy='1.13')\n", - "\n", - "import warnings\n", - "warnings.simplefilter('ignore')" - ] - }, - { - "cell_type": "markdown", - "id": "5fc91aeb", - "metadata": { - "tags": [] - }, - "source": [ - "# Part 2: Drought" - ] - }, - { - "cell_type": "markdown", - "id": "634457a3", - "metadata": {}, - "source": [ - "According to the [United States Environmental Protection Agency](https://www.epa.gov/climate-indicators/southwest), \"Large portions of the Southwest have experienced drought conditions since weekly Drought Monitor records began in 2000. For extended periods from 2002 to 2005 and from 2012 to 2020, nearly the entire region was abnormally dry or even drier.\" \n", - "\n", - "Assessing the impact of drought is challenging with just city-level data because so much of the water that people use is transported from elsewhere, but we'll explore the data we have and see what we can learn.\n", - "\n", - "Let's first take a look at the precipitation data in the Southwest region. The `southwest.csv` file contains total annual precipitation for 13 cities in the southwestern United States for each year from 1960 to 2021. This dataset is aggregated from the daily data and includes only the Southwest cities from the original dataset that have consistent precipitation records back to 1960." 
- ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "d51fa782", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
City Year Total Precipitation
Albuquerque 1960 8.12
Albuquerque 1961 8.87
Albuquerque 1962 5.39
Albuquerque 1963 7.47
Albuquerque 1964 7.44
\n", - "

... (788 rows omitted)

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "southwest = Table.read_table('southwest.csv')\n", - "southwest.show(5)" - ] - }, - { - "cell_type": "markdown", - "id": "dbed1948", - "metadata": {}, - "source": [ - "**Question 2.1.** Create a table `totals` that has one row for each year in chronological order. It should contain the following columns:\n", - "1. `\"Year\"`: The year (a number)\n", - "2. `\"Precipitation\"`: The total precipitation in all 13 southwestern cities that year\n" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "7c4a3315", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Year Precipitation
1960 149.58
1961 134.82
1962 130.41
1963 132.18
1964 123.41
1965 187.53
1966 120.27
1967 179.02
1968 136.25
1969 191.72
\n", - "

... (51 rows omitted)

" - ], - "text/plain": [ - "Year | Precipitation\n", - "1960 | 149.58\n", - "1961 | 134.82\n", - "1962 | 130.41\n", - "1963 | 132.18\n", - "1964 | 123.41\n", - "1965 | 187.53\n", - "1966 | 120.27\n", - "1967 | 179.02\n", - "1968 | 136.25\n", - "1969 | 191.72\n", - "... (51 rows omitted)" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "totals = southwest.select(\"Year\", \"Total Precipitation\").group(\"Year\", sum).relabeled(\"Total Precipitation sum\", \"Precipitation\")\n", - "totals" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "2906dae7", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "

q2_1
passed! 🌈

" - ], - "text/plain": [ - "q2_1 results: All test cases passed!" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grader.check(\"q2_1\")" - ] - }, - { - "cell_type": "markdown", - "id": "0acc6eea", - "metadata": {}, - "source": [ - "Run the cell below to plot the total precipitation in these cities over time, so that we can try to spot the drought visually. As a reminder, the drought years given by the EPA were (2002-2005) and (2012-2020)." - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "0f6791ec", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Just run this cell\n", - "totals.plot(\"Year\", \"Precipitation\")" - ] - }, - { - "cell_type": "markdown", - "id": "acf26fe5", - "metadata": {}, - "source": [ - "This plot isn't very revealing. Each year has a different amount of precipitation, and there is quite a bit of variability across years, as if each year's precipitation is a random draw from a distribution of possible outcomes. \n", - "\n", - "Could it be that these so-called \"drought conditions\" from 2002-2005 and 2012-2020 can be explained by chance? In other words, could it be that the annual precipitation amounts in the Southwest for these drought years are like **random draws from the same underlying distribution** as for other years? Perhaps nothing about the Earth's precipitation patterns has really changed, and the Southwest U.S. just happened to experience a few dry years close together. \n", - "\n", - "To assess this idea, let's conduct an A/B test in which **each year's total precipitation** is an outcome, and the condition is **whether or not the year is in the EPA's drought period**." - ] - }, - { - "cell_type": "markdown", - "id": "8e18aba6", - "metadata": {}, - "source": [ - "This `drought_label` function distinguishes between drought years as described in the U.S. EPA statement above (2002-2005 and 2012-2020) and other years. Note that the label \"other\" is perhaps misleading, since there were other droughts before 2000, such as the massive [1988 drought](https://en.wikipedia.org/wiki/1988%E2%80%9390_North_American_drought) that affected much of the U.S. However, if we're interested in whether these modern drought periods (2002-2005 and 2012-2020) are *normal* or *abnormal*, it makes sense to distinguish the years in this way. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "b8c13e9e", - "metadata": {}, - "outputs": [], - "source": [ - "def drought_label(n):\n", - " \"\"\"Return the label for an input year n.\"\"\"\n", - " if 2002 <= n <= 2005 or 2012 <= n <= 2020:\n", - " return 'drought'\n", - " else:\n", - " return 'other'" - ] - }, - { - "cell_type": "markdown", - "id": "7ddb1cdf", - "metadata": {}, - "source": [ - "\n", - "\n", - "**Question 2.2.** Define null and alternative hypotheses for an A/B test that investigates whether drought years are **drier** (have less precipitation) than other years.\n", - "\n", - "*Note:* Please format your answer using the following structure.\n", - "\n", - "- *Null hypothesis:* ...\n", - "- *Alternative hypothesis:* ...\n", - "\n", - "\n", - "![](\"hello.png\")" - ] - }, - { - "cell_type": "markdown", - "id": "02be5cdd", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "id": "9a76f37d", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "**Question 2.3.** First, define the table `drought`. It should contain one row per year and the following two columns:\n", - "- `\"Label\"`: Denotes if a year is part of a `\"drought\"` year or an `\"other\"` year\n", - "- `\"Precipitation\"`: The sum of the total precipitation in 13 Southwest cities that year\n", - "\n", - "Then, construct an overlaid histogram of two observed distributions: the total precipitation in drought years and the total precipitation in other years. \n", - "\n", - "*Note*: Use the provided `bins` when creating your histogram, and do not re-assign the `southwest` table. 
Feel free to use as many lines as you need!\n", - "\n", - "*Hint*: The optional `group` argument in a certain function might be helpful!\n" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "01298c8b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "bins = np.arange(85, 215+1, 13)\n", - "drought = totals.with_column(\"Label\", totals.apply(drought_label, \"Year\"))\n", - "drought\n", - "drought.hist(\"Precipitation\", bins = bins, group = \"Label\")\n", - "# ..." - ] - }, - { - "cell_type": "markdown", - "id": "22ecd05e", - "metadata": {}, - "source": [ - "\n", - "\n", - "Before you continue, inspect the histogram you just created and try to guess the conclusion of the A/B test. Building intuition about the result of hypothesis testing from visualizations is quite useful for data science applications. " - ] - }, - { - "cell_type": "markdown", - "id": "d1f76e26", - "metadata": {}, - "source": [ - "**Question 2.4.** Our next step is to choose a test statistic based on our alternative hypothesis in Question 2.2. Which of the following options are valid choices for the test statistic? Assign `ab_test_stat` to **an array of integers** corresponding to valid choices. Assume averages and totals are taken over the total precipitation sums for each year.\n", - "\n", - "1. The difference between the **total** precipitation in **drought** years and the **total** precipitation in **other** years.\n", - "2. The difference between the **total** precipitation in **others** years and the **total** precipitation in **drought** years.\n", - "3. The **absolute** difference between the **total** precipitation in others years and the **total** precipitation in drought years.\n", - "1. The difference between the **average** precipitation in **drought** years and the **average** precipitation in **other** years.\n", - "2. The difference between the **average** precipitation in **others** years and the **average** precipitation in **drought** years.\n", - "3. 
The **absolute** difference between the **average** precipitation in others years and the **average** precipitation in drought years.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "a46dcde3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ab_test_stat = make_array(1)" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "1734a0aa", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "

q2_4
passed! 💯

" - ], - "text/plain": [ - "q2_4 results: All test cases passed!" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grader.check(\"q2_4\")" - ] - }, - { - "cell_type": "markdown", - "id": "11a5a872", - "metadata": {}, - "source": [ - "\n", - "\n", - "**Question 2.5.** Fellow climate scientists Will and Nicole point out that there are more **other** years than **drought** years, and so measuring the difference between total precipitation will always favor the **other** years. They conclude that all of the options above involving **total** precipitation are invalid test statistic choices. Do you agree with them? Why or why not?\n" - ] - }, - { - "cell_type": "markdown", - "id": "843314b0", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "id": "846822aa", - "metadata": {}, - "source": [ - "\n", - "\n", - "Before going on, check your `drought` table. It should have two columns `Label` and `Precipitation` with 61 rows, 13 of which are for `\"drought\"` years." - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "c149a181", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Year Precipitation Label
1960 149.58 other
1961 134.82 other
1962 130.41 other
\n", - "

... (58 rows omitted)

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drought.show(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "eda39f8b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Label count
drought 13
other 48
" - ], - "text/plain": [ - "Label | count\n", - "drought | 13\n", - "other | 48" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "drought.group('Label')" - ] - }, - { - "cell_type": "markdown", - "id": "e7e7556d", - "metadata": {}, - "source": [ - "**Question 2.6.** For our A/B test, we'll use the difference between the average precipitation in drought years and the average precipitation in other years as our test statistic:\n", - "\n", - "$$\\text{average precipitation in \"drought\" years} - \\text{average precipitation in \"other\" years}$$\n", - "\n", - "First, complete the function `test_statistic`. It should take in a two-column table `t` with one row per year and two columns:\n", - "- `Label`: the label for that year (either `'drought'` or `'other'`)\n", - "- `Precipitation`: the total precipitation in the 13 Southwest cities that year. \n", - "\n", - "Then, use the function you define to assign `observed_statistic` to the observed test statistic.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "7d76952a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "-15.856714743589748" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def test_statistic(t):\n", - " obs = np.average(t.where('Label', are.equal_to('drought')).column('Precipitation')) - np.average(t.where('Label', are.equal_to('other')).column('Precipitation'))\n", - " return obs\n", - "\n", - "observed_statistic = test_statistic(drought)\n", - "observed_statistic" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "05707622", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "

q2_6
passed! 🚀

" - ], - "text/plain": [ - "q2_6 results: All test cases passed!" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grader.check(\"q2_6\")" - ] - }, - { - "cell_type": "markdown", - "id": "e142edbd", - "metadata": {}, - "source": [ - "Now that we have defined our hypotheses and test statistic, we are ready to conduct our hypothesis test. We’ll start by defining a function to simulate the test statistic under the null hypothesis, and then call that function 5,000 times to construct an empirical distribution under the null hypothesis." - ] - }, - { - "cell_type": "markdown", - "id": "40bb3df2", - "metadata": {}, - "source": [ - "**Question 2.7.** Write a function to simulate the test statistic under the null hypothesis. The `simulate_precipitation_null` function should simulate the null hypothesis once (not 5,000 times) and return the value of the test statistic for that simulated sample.\n", - "\n", - "*Hint*: Using `t.with_column(...)` with a column name that already exists in a table `t` will replace that column with the newly specified values.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "d5ead0ae", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9.4788141025641153" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def simulate_precipitation_null():\n", - " sampled = drought.sample(with_replacement = False).column(\"Label\")\n", - " sampledTable = drought.drop(\"Label\").with_column(\"Label\", sampled)\n", - " stat = test_statistic(sampledTable)\n", - " return stat\n", - "\n", - "\n", - "# Run your function a couple times to make sure that it works\n", - "simulate_precipitation_null()" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "0d0c3970", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "

q2_7
passed! ✨

" - ], - "text/plain": [ - "q2_7 results: All test cases passed!" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grader.check(\"q2_7\")" - ] - }, - { - "cell_type": "markdown", - "id": "e975b101", - "metadata": {}, - "source": [ - "**Question 2.8.** Fill in the blanks below to complete the simulation for the hypothesis test. Your simulation should compute 5,000 values of the test statistic under the null hypothesis and store the result in the array `sampled_stats`.\n", - "\n", - "*Hint:* You should use the `simulate_precipitation_null` function you wrote in the previous question!\n", - "\n", - "*Note:* Running this cell may take a few seconds. If it takes more than a minute, try to find a different (faster) way to implement your `simulate_precipitation_null` function.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "ab66f8ab", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sampled_stats = make_array()\n", - "\n", - "repetitions = 5000\n", - "for i in np.arange(repetitions):\n", - " sampled_stats = np.append(sampled_stats, simulate_precipitation_null())\n", - "\n", - "# Do not change these lines\n", - "Table().with_column('Difference Between Means', sampled_stats).hist()\n", - "plt.scatter(observed_statistic, 0, c=\"r\", s=50);\n", - "plt.ylim(-0.01);" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "241e0bd5", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "

q2_8
passed! ✨

" - ], - "text/plain": [ - "q2_8 results: All test cases passed!" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grader.check(\"q2_8\")" - ] - }, - { - "cell_type": "markdown", - "id": "5a70d827", - "metadata": {}, - "source": [ - "**Question 2.9.** Compute the p-value for this hypothesis test, and assign it to the variable `precipitation_p_val`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "76ddf1da", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.5" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "precipitation_p_val = 0.5\n", - "precipitation_p_val" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "4bf94c97", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "

q2_9
passed! 🙌

" - ], - "text/plain": [ - "q2_9 results: All test cases passed!" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grader.check(\"q2_9\")" - ] - }, - { - "cell_type": "markdown", - "id": "95088247", - "metadata": {}, - "source": [ - "\n", - "\n", - "**Question 2.10.** State a conclusion from this test using a p-value cutoff of 5%. What have you learned about the EPA's statement on drought?\n" - ] - }, - { - "cell_type": "markdown", - "id": "7df7efbd", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "id": "2eca4c61", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "**Question 2.11.** Does your conclusion from Question 2.10 apply to the entire Southwest region of the U.S.? Why or why not?\n" - ] - }, - { - "cell_type": "markdown", - "id": "8aaeb116", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "id": "39039658", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Conclusion" - ] - }, - { - "cell_type": "markdown", - "id": "73cd05b0", - "metadata": {}, - "source": [ - "Data science plays a central role in climate change research because massive simulations of the Earth's climate are necessary to assess the implications of climate data recorded from weather stations, satellites, and other sensors. [Berkeley Earth](http://berkeleyearth.org/data/) is a common source of data for these kinds of projects.\n", - "\n", - "In this project, we found ways to apply our statistical inference technqiues that rely on random sampling even in situations where the data were not generated randomly, but instead by some complicated natural process that appeared random. We made assumptions about randomness and then came to conclusions based on those assumptions. 
Great care must be taken to choose assumptions that are realistic, so that the resulting conclusions are not misleading. However, making assumptions about data can be productive when doing so allows inference techniques to apply to novel situations." - ] - }, - { - "cell_type": "markdown", - "id": "f9218555", - "metadata": {}, - "source": [ - "\"Picture\n", - "\n", - "**Congratulations -- Lily says you're done with the lab!**\n" - ] - }, - { - "cell_type": "markdown", - "id": "936bb88e", - "metadata": {}, - "source": [ - "\n", - "---\n", - "\n", - "You're done with lab!\n", - "\n", - "**Important submission information:** (Self Service Lab Only)\n", - "- **Run all the tests** and verify that they all pass\n", - "- **Save** from the **File** menu\n", - "- **Run the final cell to generate the zip file**\n", - "- **Click the link to download the zip file**\n", - "- Then, go to [Gradescope](https://www.gradescope.com/courses/546043) and submit the zip file to the corresponding assignment. The name of this assignment is \"Lab XX Autograder\", where XX is the lab number -- 01, 02, 03, etc.\n", - "\n", - "- If you finish early in Regular Lab, **ask one of the staff members to check you off**.\n", - "\n", - "**It is your responsibility to make sure your work is saved before running the last cell.**\n" - ] - }, - { - "cell_type": "markdown", - "id": "77da1b60", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "## Submission\n", - "\n", - "Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output. The cell below will generate a zip file for you to submit. 
**Please save before exporting!**" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "ed4603ff", - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running your submission against local test cases...\n", - "\n", - "\n", - "\n", - "Your submission received the following results when run against available test cases:\n", - "\n", - " q2_1 results: All test cases passed!\n", - "\n", - " q2_4 results: All test cases passed!\n", - "\n", - " q2_6 results: All test cases passed!\n", - "\n", - " q2_7 results: All test cases passed!\n", - "\n", - " q2_8 results:\n", - " q2_8 - 1 result:\n", - " ❌ Test case failed\n", - " Trying:\n", - " len(sampled_stats) == 5000\n", - " Expecting:\n", - " True\n", - " **********************************************************************\n", - " Line 1, in q2_8 0\n", - " Failed example:\n", - " len(sampled_stats) == 5000\n", - " Exception raised:\n", - " Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.9/doctest.py\", line 1334, in __run\n", - " exec(compile(example.source, filename, \"single\",\n", - " File \"\", line 1, in \n", - " len(sampled_stats) == 5000\n", - " NameError: name 'sampled_stats' is not defined\n", - "\n", - " q2_8 - 2 result:\n", - " ❌ Test case failed\n", - " Trying:\n", - " np.std(sampled_stats) > 0\n", - " Expecting:\n", - " True\n", - " **********************************************************************\n", - " Line 1, in q2_8 1\n", - " Failed example:\n", - " np.std(sampled_stats) > 0\n", - " Exception raised:\n", - " Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.9/doctest.py\", line 1334, in __run\n", - " exec(compile(example.source, filename, \"single\",\n", - " File \"\", line 1, in \n", - " np.std(sampled_stats) > 0\n", - " NameError: name 'sampled_stats' is not defined\n", - "\n", - " q2_9 results:\n", - " q2_9 - 1 result:\n", - " ❌ Test case 
failed\n", - " Trying:\n", - " type(precipitation_p_val) in set([float, np.float32, np.float64])\n", - " Expecting:\n", - " True\n", - " **********************************************************************\n", - " Line 1, in q2_9 0\n", - " Failed example:\n", - " type(precipitation_p_val) in set([float, np.float32, np.float64])\n", - " Expected:\n", - " True\n", - " Got:\n", - " False\n", - "\n", - " q2_9 - 2 result:\n", - " ❌ Test case failed\n", - " Trying:\n", - " 0 <= precipitation_p_val <= 1\n", - " Expecting:\n", - " True\n", - " **********************************************************************\n", - " Line 1, in q2_9 1\n", - " Failed example:\n", - " 0 <= precipitation_p_val <= 1\n", - " Exception raised:\n", - " Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.9/doctest.py\", line 1334, in __run\n", - " exec(compile(example.source, filename, \"single\",\n", - " File \"\", line 1, in \n", - " 0 <= precipitation_p_val <= 1\n", - " TypeError: '<=' not supported between instances of 'int' and 'ellipsis'\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

Your submission has been exported. Click here\n", - " to download the zip file.

\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Save your notebook first, then run this cell to export your submission.\n", - "grader.export(pdf=False, run_tests=True)" - ] - }, - { - "cell_type": "markdown", - "id": "733f7bdd", - "metadata": {}, - "source": [ - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - }, - "otter": { - "OK_FORMAT": true, - "tests": { - "q2_1": { - "name": "q2_1", - "points": [ - 0, - 1 - ], - "suites": [ - { - "cases": [ - { - "code": ">>> totals.num_rows == 61\nTrue", - "hidden": false, - "locked": false - }, - { - "code": ">>> totals.labels == ('Year', 'Precipitation')\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q2_4": { - "name": "q2_4", - "points": [ - 0, - 0, - 2 - ], - "suites": [ - { - "cases": [ - { - "code": ">>> type(ab_test_stat) == np.ndarray\nTrue", - "hidden": false, - "locked": false - }, - { - "code": ">>> all([1 <= option <= 6 for option in ab_test_stat])\nTrue", - "hidden": false, - "locked": false - }, - { - "code": ">>> all([type(option) in set([np.int64, np.int32, int]) for option in list(ab_test_stat)])\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q2_6": { - "name": "q2_6", - "points": [ - 1 - ], - "suites": [ - { - "cases": [ - { - "code": ">>> observed_statistic < 0\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q2_7": 
{ - "name": "q2_7", - "points": [ - 1 - ], - "suites": [ - { - "cases": [ - { - "code": ">>> -50 < simulate_precipitation_null() < 50\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q2_8": { - "name": "q2_8", - "points": [ - 0, - 0 - ], - "suites": [ - { - "cases": [ - { - "code": ">>> len(sampled_stats) == 5000\nTrue", - "hidden": false, - "locked": false - }, - { - "code": ">>> np.std(sampled_stats) > 0\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q2_9": { - "name": "q2_9", - "points": [ - 0, - 1 - ], - "suites": [ - { - "cases": [ - { - "code": ">>> type(precipitation_p_val) in set([float, np.float32, np.float64])\nTrue", - "hidden": false, - "locked": false - }, - { - "code": ">>> 0 <= precipitation_p_val <= 1\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/lowContrastGraph.ipynb b/lowContrastGraph.ipynb deleted file mode 100644 index e4ebfee2d..000000000 --- a/lowContrastGraph.ipynb +++ /dev/null @@ -1,91 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "1bee8719-9e54-4261-acad-1450e76a9b0d", - "metadata": {}, - "source": [ - "![hi](graphIssues1.png)" - ] - }, - { - "cell_type": "markdown", - "id": "a9dd919b-c788-4898-8f78-bc652ceee089", - "metadata": {}, - "source": [ - "![hi](graphIssues2.jpeg)" - ] - }, - { - "cell_type": "markdown", - "id": "2f18ada6-1ca7-4b2b-9709-2bb79f88eeaf", - "metadata": {}, - "source": [ - "[hi](graphIssues3.png)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "03f2a4c6-8f8c-4cb3-a6ce-3f8b93adc9c0", - "metadata": {}, - "outputs": [], - "source": [ - "# import matplotlib.pyplot as plt\n", - "# import numpy as np\n", - "\n", - "# # Data\n", - "# x = 
np.linspace(0, 10, 100)\n", - "# y1 = np.sin(x)\n", - "# y2 = np.cos(x)\n", - "\n", - "# # Create plot\n", - "# plt.figure(figsize=(8, 6))\n", - "\n", - "# # Sine curve with poor color contrast and poor transparency\n", - "# plt.plot(x, y1, label='Sine', color='lightgrey', alpha=0.2)\n", - "\n", - "# # Cosine curve with poor color contrast\n", - "# plt.plot(x, y2, label='Cosine', color='yellow')\n", - "\n", - "# # Unstructured header (just a plain text, no structure)\n", - "# plt.text(2, 1, 'A Graph', fontsize=20, color='black')\n", - "\n", - "# # X-axis label\n", - "# plt.xlabel('X-axis')\n", - "\n", - "# # Y-axis label\n", - "# plt.ylabel('Y-axis')\n", - "\n", - "# # No title\n", - "\n", - "# # No legend added\n", - "# # plt.legend()\n", - "\n", - "# # Display the plot\n", - "# plt.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/proj4.ipynb b/proj4.ipynb deleted file mode 100644 index 05189a696..000000000 --- a/proj4.ipynb +++ /dev/null @@ -1,2232 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Initialize Otter\n", - "import otter\n", - "grader = otter.Notebook(\"proj4.ipynb\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "# Project 4: Mongo \n", - "\n", - "## Due Date: Thursday 11/30, 5:00 PM\n", - "\n", - "In this project, we will be investigating how different database 
systems handle semi-structured JSON data. In particular, we will be placing emphasis on the use of MongoDB: a database system that stores data in a construct known as documents. These documents are very similar to the JSON objects we've explored in lecture, with a few differences in representation and indexing that we will explore in the following questions. \n", - "\n", - "In this project, we will be working with the **Yelp Academic Dataset** which contains a dataset of `businesses`, `reviews`, and `users`. Due to the limitations of JupyterHub and the Mongo instances we are working with, `reviews` and `users` are truncated to 7500 reviews and 1000 users. We will be using the full `businesses` dataset, however.\n", - "\n", - "Throughout the course of this project, you should understand what Mongo can (and cannot) do with regards to its documents as a NoSQL datastore and compare and contrast this to other data representation formats such as the relational model." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Logistics & Scoring Breakdown\n", - "\n", - "Please read the submission instructions carefully and double check that your submission is not throwing any errors. Please ensure that public tests pass upon submission. It is your responsibility to wait until the autograder finishes running. We will not be accepting regrade requests for submission issues.\n", - "\n", - "Each coding question has **both public tests and hidden tests**. Roughly 50% of your coding grade will be made up of your score on the public tests released to you, while the remaining 50% will be made up of unreleased hidden tests. **Free-response questions (marked 'm' in the table below) are manually graded.**\n", - "\n", - "This is an **individual project**. 
However, you’re welcome to collaborate with any other student in the class as long as it’s within the academic honesty guidelines.\n", - "\n", - "Question | Points\n", - "--- | ---\n", - "1a\t| 1\n", - "1b | 1\n", - "1c\t| 2\n", - "1d\t| 1\n", - "1e\t| 2\n", - "1f | 1\n", - "2a\t| m: 2\n", - "2b\t| 1\n", - "2c | 1\n", - "2d | m: 2\n", - "3a\t| m: 1\n", - "3b\t| 1\n", - "3c\t| 1\n", - "3d | 1\n", - "3e | 1\n", - "3f | 3\n", - "4a\t| 1\n", - "4b\t| 2\n", - "4c\t| 2\n", - "4d | 1\n", - "**Total** | 28\n", - "\n", - "**Grand Total:** 28 points (autograded: 23, manual: 5) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading Up Mongo\n", - "We will be using Pymongo, a Python wrapper for MongoDB, for this project. Every student should have access to their own MongoDB instance, running on the localhost of your Datahub server. After running the following cell, for the rest of the project, you can use the Python variables business, review, and user to access the corresponding collection.\n", - "\n", - "To prevent bracket mismatches while creating your queries, it is recommended to turn on \"Auto Close Brackets\" via Settings in JupyterHub.\n", - "Furthermore, since we are using Python dictionaries as our query filter, make sure to wrap all keys and values inside quotes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import pickle\n", - "import pandas as pd\n", - "import pymongo\n", - "from pymongo import TEXT\n", - "import numpy as np\n", - "\n", - "myclient = pymongo.MongoClient(\"mongodb://localhost\")\n", - "mydb = myclient[\"yelp\"]\n", - "business = mydb[\"business\"]\n", - "review = mydb[\"review\"]\n", - "user = mydb[\"user\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Troubleshooting\n", - "\n", - "**PLEASE READ:** Please avoid printing too much debugging query output—it may crash your Jupyter Hub if your file size becomes too large! It's recommended to use the `limit()` method and delete any debugging query cells if no longer needed as you go through the project.\n", - "\n", - "You might run into issues on the project where you are certain your code works but the output is incorrect. This may be because your collections have been corrupted. Run the following cell and uncomment the specific collections you would like to drop if you would like to remake your collections from scratch. **Be sure to re-run the Load Datasets cells below if you drop your collections so you aren't working with empty collections!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# UNCOMMENT AND RUN THIS CELL IF YOU WOULD LIKE TO REMAKE YOUR COLLECTIONS FROM SCRATCH. \n", - "# IF YOU DROP ANY COLLECTIONS, RE-RUN THE NEXT TWO CELLS TO LOAD IN THE DATA.\n", - "\n", - "# review.drop()\n", - "# business.drop()\n", - "# user.drop()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Datasets\n", - "The following 2 cells will load the JSON datasets into the appropriate Mongo collections. You will only need to run them once unless you drop the collections above. 
The second cell **may take a couple of minutes to run** if you are running it for the first time or are running it after you dropped the collections." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import zipfile\n", - "import os.path\n", - "\n", - "if not os.path.isfile('data/yelp_academic_dataset_review.json'):\n", - " with zipfile.ZipFile('data/yelp_academic_dataset_review.json.zip', 'r') as zip_ref:\n", - " zip_ref.extractall('data')\n", - "\n", - "if not os.path.isfile('data/yelp_academic_dataset_user.json'):\n", - " with zipfile.ZipFile('data/yelp_academic_dataset_user.json.zip', 'r') as zip_ref:\n", - " zip_ref.extractall('data')\n", - "\n", - "if not os.path.isfile('data/yelp_academic_dataset_business.json'):\n", - " with zipfile.ZipFile('data/yelp_academic_dataset_business.json.zip', 'r') as zip_ref:\n", - " zip_ref.extractall('data')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# THIS CELL MAY TAKE AT MOST 5 MINUTES. 
BUT HOPEFULLY YOU WILL ONLY NEED TO RUN IT ONCE.\n", - "import json\n", - "\n", - "if business.count_documents({}) == 0:\n", - " print(\"Loading business collection...\")\n", - " with open('data/yelp_academic_dataset_business.json', encoding='utf-8') as f:\n", - " for line in f:\n", - " business.insert_one(json.loads(line))\n", - "\n", - "if review.count_documents({}) == 0:\n", - " print(\"Loading review collection...\")\n", - " with open('data/yelp_academic_dataset_review.json', encoding='utf-8') as f:\n", - " for line in f:\n", - " review.insert_one(json.loads(line))\n", - " \n", - "if user.count_documents({}) == 0:\n", - " print(\"Loading user collection...\")\n", - " with open('data/yelp_academic_dataset_user.json', encoding='utf-8') as f:\n", - " for line in f:\n", - " user.insert_one(json.loads(line))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a quick look at our collections. For the command below, replace `user` with `review` or `business` to count the number of documents in each collection." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "user.count_documents({})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's inspect our collections. Replace `business` with `review` and `user` to see the first document in each collection." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "business.find_one()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you see a document containing a business named `Oskar Blues Taproom` when you run the command above, it means that our JSON data has successfully been imported into the collection! Now we can get started with exploring Mongo in a bit more detail." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Connect to the grader" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the following cell for grading purposes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Just run the following cell, no further action is needed.\n", - "from data101_utils import GradingUtil\n", - "grading_util = GradingUtil(\"proj4\")\n", - "grading_util.prepare_autograder()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Do not delete/edit this cell\n", - "import pickle\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "
\n", - "\n", - "## Question 1: Basic MQL\n", - "\n", - "### Question 1a\n", - "\n", - "In lecture, we discussed how one could find specific attributes from a JSON object using dot (`.`) notation. \n", - "\n", - "- While you can still use the dot notation in queries, PyMongo represents documents returned from Mongo queries using Python dictionaries, making it convenient to manipulate JSON using a mix of Mongo queries and array indexing. Specifically, given the result of a retrieval `find` query, you can look up the third document by indexing with `[2]`. Note, since we are using Python dictionaries, we will be using 0-based indexing. Then, given this document, you can look up the field `'amount'` by appending `['amount']` etc., adding multiple square brackets as needed to \"walk down\" the JSON tree representation via `collection.find(...)[2]['amount']`. This will return the 'amount' field from the 3rd document returned from the query. This combination of query and indexing will be useful in obtaining the necessary information you need for this question.\n", - " \n", - "- In order to get a visual output of the query results, you will need to wrap `collection.find(...)` inside `list()`, e.g. `list(collection.find(...))`. This is because `collection.find(...)` returns a **Cursor** object, which is an iterator. **An important consequence** is that if we set `result = collection.find(...)`, then calling `list(result)` for the first time will get you the expected list of documents in the query result, but calling `list(result)` for a second time will give you an empty list! So wrapping `collection.find(...)` directly inside `list()` would avoid this issue. With that in mind, you may not *always* need to obtain a visual output of the results.\n", - " \n", - "- Be aware of the distinction of when you are querying with Mongo versus Python-based array indexing into your Mongo query results (i.e. 
you are wrapping your query inside `list()` and *then* indexing into that list.)\n", - " \n", - "- **As a reminder, since we are using Python dictionaries as our query filter, make sure to wrap all keys and values inside quotes.**\n", - "\n", - "As a warmup to get you familiarized with PyMongo syntax, find the **Tuesday hours** for the restaurant named **Legal Sea Foods** at **100 Huntington Ave** in **Boston**. Be careful—there are many Legal Sea Foods in Boston!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "result_1a = ...\n", - "result_1a" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_1a\", result_1a);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q1a\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "\n", - "### Question 1b\n", - "Now let's get some practice with aggregation and filtering. Our goal is to write a query that computes the average star rating for all businesses in Colorado with 30 reviews or greater. However, this won't be as easy as setting the state to CO! If we inspect this dataset more closely, we will notice that some cities are not matched up with the right states. As an example, run the query below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list(business.find({\"state\": \"CA\"}).limit(3))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice how cities like Portland, Atlanta, and Orlando are classified as California cities! However, the latitude and longitude are generally correct. The latitude of Colorado is between 37 and 41 **inclusive** and the longitude is between -109 and -102 **inclusive**. Now, use this to **find the average star rating** of all businesses in this range with **30 or more reviews**.\n", - "\n", - "Recall that in SQL, we would use a GROUP BY with the AVG aggregation function. In Mongo, we use an aggregation pipeline [(documentation here)](https://www.mongodb.com/docs/manual/reference/method/db.collection.aggregate/), comprised of multiple stages (e.g., `$match` followed by `$group`). Each stage transforms the documents in some way. Pipeline stages do not need to produce one output document for every input document. For example, some stages may generate new documents or filter out documents.\n", - "\n", - "**Hints:**\n", - "- As in the previous question, you may find it helpful to use the PyMongo array notation to extract the pertinent information/document once you have composed the right Mongo aggregation query. You are required to wrap `collection.aggregate(...)` inside `list()`, e.g. `list(collection.aggregate(...))` before indexing / visualizing the output. 
Similar to `collection.find(...)`, `collection.aggregate(...)` also returns a **Cursor** object (which is an iterator).\n", - "\n", - "- You can set multiple conditions for a given field within the same object, e.g. `{\"$gte\": 0, \"$lte\": 10}`. This is the recommended approach, or else you may need to worry about the ordering between the conditions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "result_1b = ...\n", - "result_1b" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_1b\", result_1b);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q1b\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 1c\n", - "\n", - "In this question, we will explore aggregation and grouping further. We will also make use of the `$project` operator which allows us to output documents with certain fields of our choosing. \n", - "\n", - "For this question, we would like to create an aggregation pipeline to find the town in each state with the highest average number of stars. **We will only consider towns with greater than or equal to 5 reviews in total across all the restaurants in that town so that the average is meaningful.** Your final output should contain exactly two fields:\n", - "- `averageStars` which contains the average number of stars for the corresponding town.\n", - "- `city_state` which is the name of the town with the highest value of average stars in the state concatenated with a comma followed by the state initials\n", - "\n", - "\n", - "To ensure your output is consistent with the autograder, **sort in descending order by `averageStars` and break ties by sorting second on `city_state` in alphabetical (ascending) order.**\n", - "\n", - "As a concrete example, imagine that Berkeley and Austin have the highest average stars in California and Texas respectively (and both have more than or equal to 5 total reviews in this *truncated* dataset). If Berkeley and Austin both have an average star rating of 5.0, your final output should be:\n", - "\n", - "```\n", - "{'averageStars': 5.0, 'city_state': 'Austin, TX'}\n", - "{'averageStars': 5.0, 'city_state': 'Berkeley, CA'}\n", - "```\n", - "\n", - "**Note:** You will provide a pipeline to `business.aggregate(...)` as your solution. Save your pipeline to `q1c_pipeline`.\n", - "\n", - "**Hint:** You may find the `concat` operator helpful [(documentation here)](https://docs.mongodb.com/manual/reference/operator/aggregation/concat/)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "q1c_pipeline = ...\n", - "\n", - "result_1c = list(business.aggregate(q1c_pipeline))\n", - "result_1c" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_1c\", list(business.aggregate(q1c_pipeline)));" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q1c\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 1d\n", - "\n", - "In class, we've described structured (rectangular) data as well as semi-structured data. We haven't quite covered unstructured data—this is basically free-form text. Often, in semi-structured JSON you may have unstructured text data embedded within, such as the text field in the review collection.\n", - "\n", - "MongoDB allows us to build a so-called **text index** to retrieve the relevant document based on keywords found in text in a predefined field. This index converts our free-form text into a structure that allows us to easily look up documents by its contents. To leverage this text search capability, we build a text index on the `text` field in the `review` collection. This has been done for you.\n", - "\n", - "We will then use this text index to do basic sentiment analysis and find all the restaurants we should avoid! Using the text index given, write a query to find all the reviews with \"disgusting\", \"horrible\", \"horrid\", \"gross\", \"bad\", or \"hate\". To use the text index, use the keywords `$text` and `$search` as detailed [here](https://www.mongodb.com/docs/manual/core/text-search-operators/).\n", - "\n", - "Fill in your query into `result_1d` to count how many reviews contain any of these 6 words.\n", - "\n", - "**Hint:** In general, you can count the number of documents returned by a `find` query result via `len(list(collection.find(...)))` or more simply `collection.count_documents(...)`. To count the number of documents returned by an `aggregate` query result, the best way is to directly use `len(list(collection.aggregate(...)))`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# We create a text index here\n", - "if 'text_text' not in review.index_information():\n", - " review.create_index([('text', TEXT)])\n", - "\n", - "result_1d = ...\n", - "result_1d" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_1d\", result_1d);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q1d\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 1e\n", - "\n", - "Now let's learn Mongo updates, deletions, and creation. Create a new collection called `review_boolean` which is the exact same as `review` EXCEPT there is a new field called `to_avoid` which is the string \"true\" if the review `text` contains any of the words \"disgusting\", \"horrid\", \"horrible\", \"gross\", \"bad\", or \"hate\" and the string \"false\" if not. \n", - "\n", - "This is a tricky task! We have not discussed creation, updates, or insertions in great detail during lecture but luckily, Mongo uses a similar approach to SQL.\n", - "\n", - "***Insertions***: In order to insert documents into a collection, you may use the functions [review_boolean.insert_one(...)](https://docs.mongodb.com/manual/reference/method/db.collection.insertOne/) or [review_boolean.insert_many(...)](https://docs.mongodb.com/manual/reference/method/db.collection.insertMany/). These functions take in a document or a list of documents, respectively, and insert them into the collection. \n", - "\n", - "***Updates***: In order to update a document, you may use the functions [review_boolean.update_one(...)](https://docs.mongodb.com/manual/reference/method/db.collection.updateOne/) or [review_boolean.update_many(...)](https://docs.mongodb.com/manual/reference/method/db.collection.updateMany/). These functions take in two parameters. The first specifies which documents should be modified. If the first parameter is `{}`, this indicates that all documents should be updated. However, you can put a more specific filter here if you would like. The second parameter specifies what you would like to update your field to (the [$set](https://docs.mongodb.com/manual/reference/operator/update/set/) operator may come in handy here). Recall that in our SQL model, updates are performed as `UPDATE ... SET ... WHERE ...`. 
In our case, the first ellipsis corresponds to `review_boolean`, the second ellipsis corresponds to the second parameter of `update_*` where `*` can be `one` or `many`, and the third ellipsis corresponds to the first parameter of `update_*`.\n", - "\n", - "***Creation***: We handle creation of the collection for you. But in Pymongo, creation of a collection is as simple as writing `variable_name = db[collection_name]` where `db` is the Pymongo database object variable you have already created (`mydb` in this notebook).\n", - "\n", - "Some additional reminders and hints:\n", - "- The empty collection `review_boolean` has already been created for you and is stored in the variable of the same name.\n", - "- A text index has been created for you. You can use a similar search approach as the last question.\n", - "- We want to start by inserting the documents from the `review` collection into the `review_boolean` collection.\n", - "- Don't forget that in order to pass the hidden tests, the `to_avoid` field must exist for every document in `review_boolean`! The [$exists](https://www.mongodb.com/docs/manual/reference/operator/query/exists/) operator may be helpful."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "review_boolean = mydb[\"review_boolean\"]\n", - "review_boolean.drop()\n", - "\n", - "# We create a text index here\n", - "if 'text_text' not in review_boolean.index_information():\n", - " review_boolean.create_index([('text', TEXT)])\n", - "\n", - "# YOUR ANSWER BEGINS HERE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "review_boolean = mydb[\"review_boolean\"]\n", - "review_boolean.find_one()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "review_boolean = mydb[\"review_boolean\"]\n", - "grading_util.save_results(\"result_1e\", list(review_boolean.find({}, {'_id': 0})));" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q1e\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 1f\n", - "\n", - "Now, you had a change of heart: you decide that it's unfair to label restaurants as `to_avoid` without at least giving them a chance! Remove the `to_avoid` field from the `review_boolean` collection. Calculate the `difference` between the data size of `review_boolean` with the `to_avoid` field and without it. The code for making this calculation is provided but it is up to you to actually remove the field.\n", - "\n", - "*Deletions*: Deletions in Mongo make use of the `review_boolean.update_one(...)` or `review_boolean.update_many(...)` functionality discussed in Question 1e. However, this time, instead of using the `$set` operator which allows for the creation of new fields, we will use the [$unset](https://docs.mongodb.com/manual/reference/operator/update/unset/) operator which deletes them! Very tidy!\n", - "\n", - "**Before running the next cell, make sure to re-run your cell for 1e so you don't get a difference of 0!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "with_avoid = mydb.command(\"collstats\", \"review_boolean\")['size']\n", - "\n", - "# YOUR ANSWER BEGINS HERE\n", - "# END\n", - "\n", - "without_avoid = mydb.command(\"collstats\", \"review_boolean\")['size']\n", - "difference = with_avoid - without_avoid\n", - "difference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_1f\", difference);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q1f\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "
\n", - "\n", - "## Question 2: JSON and Relational Models\n", - "\n", - "### Question 2a\n", - "\n", - "Now we have a good idea of how to do retrieval, aggregation, and updates in Mongo. But we haven't talked about why we\n", - "would want to use Mongo to store JSON! In order to explore this, let's take another look at the `business`\n", - "collection. We will look at the first two entries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list(business.find({}).limit(2))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "What are **two** benefits of storing this data in MongoDB with JSON over a relational database management system such as Postgres?\n", - "Please reference specific examples from the `business` collection to back up your claims. \n", - "- Format your answer as follows:\n", - " 1. Benefit #1, Example #1.\n", - " 2. Benefit #2, Example #2.\n", - "\n", - "**Limit each benefit to 1 sentence and each example to 1 sentence for a total of at most four sentences.**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "
\n", - "\n", - "---\n", - "### Question 2b\n", - "\n", - "It seems like MongoDB is getting all the love when it comes to JSON support! However, modern iterations of relational databases\n", - "such as Postgres 9.3+ also have [excellent JSON functionality](https://www.postgresql.org/docs/9.3/functions-json.html) as we will soon explore in this task. First, let's set up a\n", - "bit of scaffolding. The following cell will import the `yelp_academic_dataset_review.json` data into a table called `reviews` in Postgres yelp database." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%reload_ext sql\n", - "%sql postgresql://jovyan@127.0.0.1:5432/postgres\n", - "\n", - "!psql -h localhost -c 'DROP DATABASE IF EXISTS yelp'\n", - "!psql -h localhost -c 'CREATE DATABASE yelp'\n", - "!psql -h localhost -d yelp -c 'DROP TABLE IF EXISTS reviews'\n", - "!psql -h localhost -d yelp -c 'CREATE TABLE reviews(data TEXT);'\n", - "!cat data/yelp_academic_dataset_review.json | psql -h localhost -d yelp -c \"COPY reviews (data) FROM STDIN;\"\n", - "%sql \\l" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, run the following cell to connect to the Postgres yelp database. There should be no errors after running the following cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%sql postgresql://jovyan@127.0.0.1:5432/yelp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the following cell to observe how this new `reviews` table looks. Note that the `data` column is stored as TEXT and not as JSON." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "SELECT * FROM reviews LIMIT 2;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Observe how the reviews table consists of one column named `data`. This column contains all the JSON documents in the \n", - "reviews collection *in text format*. Use [Postgres' JSON functions](https://www.postgresql.org/docs/9.3/functions-json.html) to write a query that converts the JSON fields into their own `TEXT` columns (**hint:** one of the operators in Table 9-40 may be useful). To be more concrete, your query should contain 8 columns in this particular order: `review_id`, `user_id`, `business_id`, `stars`, `useful`, `funny`, `cool`, and `text`. Each row should correspond to one JSON document. Some skeleton code (that does the mundane work of converting data to JSON properly) is provided to you—you will only need to fill in the SELECT clause." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%sql --save query_2b result_2b <<\n", - "...\n", - "FROM (SELECT CAST(regexp_replace(data, E'[\\\\n\\\\r]+', '','g') AS JSON) AS values FROM reviews) b\n", - "ORDER BY review_id\n", - "LIMIT 10;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "query_2b = %sqlcmd snippets query_2b\n", - "grading_util.save_results(\"result_2b\", query_2b, result_2b)\n", - "result_2b.DataFrame().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q2b\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 2c\n", - "\n", - "One important aspect of data engineering that we have not referred to yet are joins. We saw, through the use of indices, selection/projection pushdown, and various physical implementations (as well as orderings), joins could be done quite efficiently in relational SQL based databases. How do joins fare in Mongo where the data stored is inherently semistructured? Let's investigate! For this question, we have provided you access to the tables `business_complete` and `review_complete` which contain the business and review collections in relational form as described in 2b (the columns of the relations\n", - "are fields in the JSON document). Each relation has its respective id (`business_id` or `review_id`) column as its primary key." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!psql -h localhost -d yelp -c 'DROP TABLE IF EXISTS business_complete'\n", - "!psql -h localhost -d yelp -c 'CREATE TABLE business_complete(business_id TEXT PRIMARY KEY, name TEXT, address TEXT, city TEXT, state TEXT, postal_code TEXT, latitude TEXT,longitude TEXT, stars TEXT, review_count TEXT, is_open TEXT, attributes TEXT, categories TEXT, hours TEXT);'\n", - "!psql -h localhost -d yelp -c 'DROP TABLE IF EXISTS review_complete'\n", - "!psql -h localhost -d yelp -c 'CREATE TABLE review_complete(review_id TEXT PRIMARY KEY, user_id TEXT, business_id TEXT, stars TEXT, useful TEXT, funny TEXT, cool TEXT,text TEXT);'\n", - "!cat data/business.csv | psql -h localhost -d yelp -c \"COPY business_complete (business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours) FROM STDIN CSV HEADER;\"\n", - "!cat data/review.csv | psql -h localhost -d yelp -c \"COPY review_complete (review_id, user_id, business_id, stars, useful, funny, cool, text) FROM STDIN CSV HEADER;\"" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "Let's take a look at how `review_complete` looks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "SELECT * FROM review_complete LIMIT 1;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At this current moment in time, Mongo only supports left joins (via the lookup aggregation stage). This is what we will compare against SQL.\n", - "\n", - "Let's start by writing a SQL query that displays all the reviews along with their associated business information. You should perform a **left join** between the `review_complete` table and the `business_complete` table on the `business_id` column, and you may project all columns. Keep a mental note of the **execution time** that you see in the query plan." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "result_2c_str = ...\n", - "!psql -h localhost -d yelp -c \"explain analyze $result_2c_str\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "Now, let's perform the equivalent left join in Mongo between `review` and `business`. **The output array field should be named as `business_info`**. Feel free to refer to the `$lookup` [documentation](https://docs.mongodb.com/manual/reference/operator/aggregation/lookup/).\n", - "\n", - "**Note:** You will provide a single-stage pipeline to `review.aggregate(...)` as your solution. Save your pipeline to `q2c_pipeline`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# We first create an index on business_id in the business collection\n", - "business.create_index('business_id', unique=True)\n", - "\n", - "q2c_pipeline = ...\n", - "\n", - "result_2c = list(review.aggregate(q2c_pipeline))[:5]\n", - "# Uncomment the line below to see your output\n", - "# result_2c" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "result_2c = list(review.aggregate(q2c_pipeline))[:5]\n", - "grading_util.save_results(\"result_2c\", result_2c);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q2c\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the following cell to examine the query plan for the Mongo query that you just wrote. Again, make a mental note of the execution time that you see (you can find the value corresponding to the key `executionTimeMillis`)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mydb.command('explain', {'aggregate': 'review', 'pipeline': q2c_pipeline, 'cursor': {}}, verbosity='executionStats')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "
\n", - "\n", - "---\n", - "### Question 2d\n", - "\n", - "In the last question, you performed equivalent left joins in both Postgres and Mongo. Now, examine their query plans, paying special attention to `executionTimeMillis`. Which join was faster? What gives that database system you chose an advantage over the other? Keep your response to at most three sentences." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "
\n", - "
\n", - "\n", - "## Question 3: Dataframes / Pandas\n", - "\n", - "### Question 3a\n", - "\n", - "So far, we've talked about NoSQL / document databases like Mongo and relational databases like Postgres. Now, we will explore data transformation with a different data model: dataframes. Dataframes are similar to relations with some differences as we will dive into here. To that end, we will use Pandas which is a Python package that allows you to work with dataframes. Pandas is widely adopted by data scientists for data loading, wrangling, cleaning, and analysis. To start, let us export our MongoDB collections into Pandas using a function called `json_normalize`. We need to truncate\n", - "`business` before we can use it to meet the memory constraints set by Jupyter. The variable `business_trunc` will contain the reference to the truncated business collection." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "business_trunc = mydb[\"business_trunc\"]\n", - "count = 0\n", - "if business_trunc.count_documents({}) != 1000:\n", - " for document in business.find({}):\n", - " count += 1\n", - " business_trunc.insert_one(document)\n", - " if count == 1000:\n", - " break\n", - "\n", - "business_cursor = business_trunc.find({})\n", - "review_cursor = mydb[\"reviews\"].find({})\n", - "user_cursor = mydb[\"users\"].find({})\n", - "\n", - "# Load the collections into Pandas. \n", - "from pandas import json_normalize\n", - "user_df = json_normalize(user_cursor)\n", - "review_df = json_normalize(review_cursor)\n", - "business_df = json_normalize(business_cursor)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For the rest of Question 3, please use the 3 dataframes we just created: `user_df`, `review_df`, and `business_df`. Let's take a look at the first 5 rows of `business_df`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "business_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "What do you notice about how the columns of `business_df` are constructed? How are values that are not found in every document handled in the pandas dataframe? Compare and contrast this dataframe representation with the document representation we saw with Mongo. Keep your response to at most two sentences.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "
\n", - "\n", - "---\n", - "### Question 3b\n", - "\n", - "In the previous question, we talked about how Mongo and Postgres approach joins. Pandas is also capable of performing joins using the [merge()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html) function! For this task, perform an inner join on `business_df` with itself on `stars`. The final dataframe should be saved to a variable called `result_3b` and should only contain 3 columns in this particular order: the name of the first restaurant, the name of the second restaurant, and the number of stars. The column names can be arbitrary.\n", - "\n", - "**Hint:** Check out [this tutorial](https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html) on selecting a subset of the Dataframe. This will be helpful in the rest of Question 3 as well!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "result_3b = ...\n", - "result_3b" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "result_3b.columns = ['first', 'second', 'stars']\n", - "grading_util.save_results(\"result_3b\", result_3b.sort_values(['first', 'second', 'stars'])[:50]);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3b\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 3c\n", - "\n", - "Due to the nested representation of the data, there are a lot of missing fields with NaN values in the `business_df` dataframe as you may have noticed in 3a. Construct a dataframe `missing_value_df` with two columns: `column_name` and `percent_missing`. `percent_missing` should be the percentage of NaN values in the corresponding column in `business_df`.\n", - "\n", - "**Hint:** use Pandas' [isnull()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isnull.html) function followed by sum()." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "missing_value_df = pd.DataFrame({'column_name': business_df.columns,\n", - " ...\n", - "missing_value_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_3c\", missing_value_df);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3c\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 3d\n", - "\n", - "Plot a histogram distribution of the percentage of NaN values across all columns (via Pandas [hist()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html) function). Don't worry about putting titles / making it look nice—we won't be grading the plot." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# PLOT HERE" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "Examine the histogram that you just plotted. How many columns are 90%+ NaN? Input your answer into `result_q3d` as an integer (e.g. if your answer is 6, then `result_q3d = 6`)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "result_q3d = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3d\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 3e\n", - "\n", - "Let us now alter `business_df` to exclude the columns with more than 80% null values (keep columns with 80% null values or fewer). This likely means the corresponding attributes are not an important factor for most businesses so we can get rid of them in our `business_df`. Create a new dataframe called `important_attribute_business_df` which contains only the columns that are kept.\n", - "\n", - "**Hint:** check out [this section](https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html#how-do-i-select-specific-rows-and-columns-from-a-dataframe) from the tutorial linked in Q3b." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "important_attribute_business_df = ...\n", - "important_attribute_business_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_3e\", important_attribute_business_df);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3e\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 3f\n", - "\n", - "At this point, you have had experience with manipulating data on Mongo, Postgres, and Pandas. In this question, we will provide 3 scenarios and using the lessons you've learned so far, please specify which of the three (Mongo, Postgres, or Pandas) would work best for this specific use case.\n", - "\n", - "1. You are doing a data journalism piece on college sports. You collect a list of colleges and for each collegiate sport program within that college, you find the budget assigned for that program. You have a choice between the following:\n", - "\n", - " A) Representing this data in JSON (e.g. \n", - " ```\n", - " {\n", - " \"UC Berkeley\": {\n", - " \"football\": \"10000000\", \n", - " \"wrestling\": \"344582\", \n", - " ...}\n", - " }\n", - " ```\n", - " ) and importing into Mongo.\n", - " \n", - " B) Representing this data as a schema in Postgres where the columns are the names of the sports.\n", - " \n", - " C) Representing this data as a dataframe in Pandas where the columns are the names of the sports.\n", - "\n", - "You would like to find the aggregate of budgets across different sports (average, sum, median, mode). What would be the best option for storing this data?\n", - "\n", - "**NOTE**: Your answer should look like `q3fi_str = ['A']` or `q3fi_str = ['B']` or `q3fi_str = ['C']`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "q3fi_str = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3fi\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2. You would now like to investigate what effect does budget have on student-athlete scholarships. 
After doing some research, you find a dataset that contains a list of every single athlete at every single college and their sport and scholarship levels (this is a massive 10GB+ dataset with millions of rows). You find another dataset that contains a list of colleges, their sports programs, and the program budget. This is another massive dataset with hundreds of thousands of rows. You would like to perform an inner join between the two datasets on school and program so you can view each student-athlete's scholarship with their sport's budget. You have a choice between the following:\n", - "\n", - " A) Representing each dataset in JSON (e.g. \n", - " ```\n", - " {\"athletes\": [\n", - " {\"Chase Garbers\": {\n", - " \"school\": \"UC Berkeley\", \n", - " \"scholarship\": \"full\", \n", - " \"sport\": \"football\", \n", - " ...\n", - " }\n", - " }, \n", - " ...\n", - " ]}\n", - " ```\n", - " and \n", - " ```\n", - " {\"schools\": [\n", - " {\"UC Berkeley\": {\n", - " \"football\": {\n", - " \"budget\": \"10000000\"\n", - " }, \n", - " ...\n", - " }\n", - " }, \n", - " ...\n", - " ]}\n", - " ```\n", - " ), importing into Mongo, and doing a join there.\n", - " \n", - " B) Representing this data as 2 schemas in Postgres where the columns for the first schema are \n", - " [`student_name`, `school`, `sport`, `scholarship`] and for the second [`school`, `sport`, `budget`].\n", - " \n", - " C) Representing this data as 2 dataframes in Pandas with the same columns as Postgres.\n", - "\n", - "What would be the best option for storing this data?\n", - "\n", - "**NOTE**: Your answer should look like `q3fii_str = ['A']` or `q3fii_str = ['B']` or `q3fii_str = ['C']` or `q3fii_str = ['D']`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "q3fii_str = ..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3fii\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "3. Finally, you are ready to start writing your article! You decide to focus on just the data from UC Berkeley. You have access to a dataset of just UC Berkeley athletes along with their sports and scholarship levels. The scholarship level data was improperly cleaned: some scholarships are recorded as strings \"full\", \"half\", or \"none\" and some are recorded as integer percentages 0-100. You would like to provide this data to your readers in a format that is susceptible to easy visualizations: e.g. graphs that show how many athletes have a full vs. half vs. no scholarship, which sports have the highest percentages of athletes with full scholarships etc. What is the best way to store this data for this purpose?\n", - "\n", - " A) Represent the dataset in JSON e.g.\n", - " ```\n", - " {\"athletes\": [\n", - " {\n", - " \"Chase Garbers\": {\n", - " \"scholarship\": \"full\", \n", - " \"sport\": \"football\"\n", - " }\n", - " },\n", - " {\n", - " \"Danielle Vosk\": {\n", - " \"scholarship\": 25,\n", - " \"sport\": \"basketball\"\n", - " }\n", - " },\n", - " ...\n", - " ]\n", - " }\n", - " ```\n", - " B) Represent this data as a schema in Postgres where the columns are [`student_name`, `sport`, `scholarship`]\n", - " \n", - " C) Represent this data as a dataframe in Pandas with the same columns as Postgres.\n", - " \n", - "**NOTE**: Your answer should look like `q3fiii_str = ['A']` or `q3fiii_str = ['B']` or `q3fiii_str = ['C']` or `q3fiii_str = ['D']`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "q3fiii_str = ..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3fiii\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "
\n", - "\n", - "## Question 4: Messy JSON\n", - "\n", - "Many of the queries you've seen or written thus far were relatively reliable: aggregating and collecting over fields\n", - "that you know exist for sure. But the nature of Mongo documents is that they are inherently flexible and semi-structured. Not every document will share every single field! In this question, we will explore how Mongo handles these use cases using the `business` collection.\n", - "\n", - "### Question 4a\n", - "\n", - "Imagine you are in charge of managing your family reunion. You would like to book a private room at a restaurant.\n", - "However, you would also like to optimize for chaos. You notice that there is an attribute called `RestaurantsGoodForGroups`. You would like to write a query that returns all restaurants that **do not** have the `RestaurantsGoodForGroups` attribute so that the trajectory of the reunion is determined by fate (**hint:** search up the `$exists` keyword). \n", - "\n", - "How many restaurants do not have the `RestaurantsGoodForGroups` attribute? You may either compute this from your query or hardcode the answer you computed. Ensure that your output for the autograder is the **number of restaurants that do not have the `RestaurantsGoodForGroups` attribute** stored in `q4a_str` as an integer.\n", - "\n", - "**Note:** You would like this list to consist solely of restaurants. This means that the business must have `Restaurants` in the `categories` field. You may perform a similar text search as question 1d. 
**This holds true for the rest of the Question 4 as well!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# The following text index may be useful!\n", - "if 'categories_text' not in business.index_information():\n", - " business.create_index([('categories', TEXT)])\n", - "\n", - "...\n", - "q4a_str = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_4a\", q4a_str)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q4a\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 4b\n", - "\n", - "Your relatives inform you that they would like to be at the restaurant when it opens to beat the crowds. Furthermore, after sending\n", - "a when2meet, most of your relatives would prefer for the meal to be on a Friday and the start time of the meal to be \n", - "between 5-6:59PM (17:00-18:59). Find the number of restaurants that open on Fridays between 17:00-18:59 (you only have to consider the opening time!) and store this in a variable labeled `q4b_str`. As a reminder, in order for a business to be a restaurant, it must have `Restaurant` in its categories. Be aware that `hours` can either be an array or `None`!\n", - "\n", - "**Hint**: \n", - "- Set up an aggregation pipeline using the `$set` and `$match` stage operators. You may also want to use the `$split` operator to parse out the Friday hours as an integer and then use comparison operators to find the restaurants that are open during the specified time. Note that using dot notation for array indexing in aggregation pipelines may not work as expected, so we recommend using `$arrayElemAt` operator.\n", - "\n", - "- Again as a reminder, you can set multiple conditions for a given field within the same object, e.g. `{\"$gte\": 0, \"$lte\": 10}`. This is the recommended approach, or else you may need to worry about the ordering between the conditions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "...\n", - "q4b_str = ..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_4b\", q4b_str)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q4b\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 4c\n", - "\n", - "Some members of your family are vegetarian so you would like to only eat at restaurants with the Vegetarian category. \n", - "However, the `categories` are stored as a single string! You would like to make it easy to access Vegetarian as a separate field. Write a query that does the following: for every category in `categories`, add a new document that contains the `ObjectId` for the previous document (labeled `_id`), the name of the business (labeled `name`), and the category (labeled `category`).\n", - "\n", - "For example, a document \n", - "```\n", - "{\n", - " \"_id\": ObjectId('606ffb0123cf2e5079dbd91f'), \n", - " \"name\": \"Wendy's\", \n", - " ..., \n", - " \"categories\": \"Salad, Vegetarian\"\n", - "} \n", - "```\n", - "would become \n", - "```\n", - "{\n", - " \"_id\": ObjectId('606ffb0123cf2e5079dbd91f'), \n", - " \"name\": \"Wendy's\",\n", - " \"category\": \"Salad\"\n", - "}\n", - "```\n", - "and \n", - "```\n", - "{\n", - " \"_id\": ObjectId('606ffb0123cf2e5079dbd91f'), \n", - " \"name\": \"Wendy's\",\n", - " \"category\": \"Vegetarian\"\n", - "}\n", - "```\n", - "\n", - "Finally, to ensure your output is consistent with the autograder, sort in ascending order by `name` and break ties on `category`. Save your pipeline to a variable called `q4c_pipeline`.\n", - "\n", - "**Hint:** The `$unwind` operator may be helpful here. You can find the documentation [here](https://www.mongodb.com/docs/manual/reference/operator/aggregation/unwind/). Be sure to check what object type `$unwind` operates on and watch out to make sure you don't have any unnecessary space in the `category` field." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "q4c_pipeline = ...\n", - "\n", - "result_4c = list(business.aggregate(q4c_pipeline))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "result_4c[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "result_4c = list(business.aggregate(q4c_pipeline))[10000:10050]\n", - "grading_util.save_results(\"result_4c\", result_4c);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q4c\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "---\n", - "### Question 4d\n", - "This change in representation has made it super easy to view all the vegetarian restaurants and count them without the use of an index since we can now simply filter by whether or not 'Vegetarian' is a field in our document! We have provided some code here to count how many vegetarian restaurants are in our dataset. Simply provide the integer count to get a point for this question :)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "q4d_pipeline = q4c_pipeline[:]\n", - "q4d_pipeline.append({\"$match\": {\"category\": 'Vegetarian'}})\n", - "result_4d = list(business.aggregate(q4d_pipeline))\n", - "\n", - "veg_count = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Do not delete/edit this cell!\n", - "# You must run this cell before running the autograder.\n", - "grading_util.save_results(\"result_4d\", veg_count)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q4d\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "
\n", - "\n", - "## Congratulations! You have finished Project 4.\n", - "\n", - "Run the following cell to zip and download the results of your queries. You will also need to run the export cell at the end of the notebook.\n", - "\n", - "**For submission on Gradescope, you will need to submit the `proj4.zip` file generated by the export cell.** Please ensure that your submission includes `proj4.pdf`. \n", - "\n", - "**Please ensure that public tests pass upon submission.** It is your responsibility to wait until the autograder finishes running. We will not be accepting regrade requests for submission issues.\n", - "\n", - "**Common submission issues:** You MUST submit the generated zip files (not folders) to the autograder. However, Safari is known to automatically unzip files upon downloading. You can fix this by going into Safari preferences, and deselect the box with the text \"Open safe files after downloading\" under the \"General\" tab." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grading_util.prepare_submission_and_cleanup()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "---\n", - "\n", - "To double-check your work, the cell below will rerun all of the autograder tests." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check_all()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "## Submission\n", - "\n", - "Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output. The cell below will generate a zip file for you to submit. 
**Please save before exporting!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "# Save your notebook first, then run this cell to export your submission.\n", - "grader.export(files=['results.zip'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - }, - "otter": { - "OK_FORMAT": true, - "tests": { - "q1a": { - "name": "q1a", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> question_1a_str = grading_util.load_results(\"result_1a\")[0]\n>>> \"-\" in question_1a_str and \":\" in question_1a_str\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q1b": { - "name": "q1b", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> question_1b_str = grading_util.load_results(\"result_1b\")[0]\n>>> question_1b_str >= 3 and question_1b_str <= 4.5\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q1c": { - "name": "q1c", - "points": 2, - "suites": [ - { - "cases": [ - { - "code": ">>> cur_test_1c = grading_util.load_results(\"result_1c\")[0]\n>>> len(cur_test_1c) == 31\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q1d": { - "name": "q1d", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> question_1d_str = 
grading_util.load_results(\"result_1d\")[0]\n>>> question_1d_str >= 700 and question_1d_str <= 800\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q1e": { - "name": "q1e", - "points": 2, - "suites": [ - { - "cases": [ - { - "code": ">>> review_boolean_ans_1e = len(grading_util.load_results(\"result_1e\")[0])\n>>> review_boolean_ans_1e == 7500\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q1f": { - "name": "q1f", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> difference = grading_util.load_results(\"result_1f\")[0]\n>>> 0 < difference <= 300000\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q2b": { - "name": "q2b", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> query_2b, result_2b_df = grading_util.load_results(\"result_2b\")\n>>> result_2b_df.iloc[0, 0] == '000bviMESLXmlIFKDzCEfw'\nTrue", - "hidden": false, - "locked": false - }, - { - "code": ">>> query_2b, result_2b_df = grading_util.load_results(\"result_2b\")\n>>> str(result_2b_df.iloc[8, 3]) == '5.0'\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q2c": { - "name": "q2c", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> cur_test_2c = grading_util.load_results(\"result_2c\")[0]\n>>> 'business_info' in list(cur_test_2c[0].keys())\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q3b": { - "name": "q3b", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> result_3b_df = grading_util.load_results(\"result_3b\")[0]\n>>> result_3b_df.shape == (50, 3)\nTrue", - "hidden": false, - "locked": false 
- } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q3c": { - "name": "q3c", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> result_3c_df = grading_util.load_results(\"result_3c\")[0]\n>>> list(result_3c_df.loc[result_3c_df['column_name'] == '_id'][\"percent_missing\"])[0] == 0\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q3d": { - "name": "q3d", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> result_q3d >= 1 and result_q3d <= 58\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q3e": { - "name": "q3e", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> result_3e_df = grading_util.load_results(\"result_3e\")[0]\n>>> len(result_3e_df) == 1000 and len(result_3e_df.columns) != 58\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q3fi": { - "name": "q3fi", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> len(q3fi_str) and ord(q3fi_str[0]) >= 65 and ord(q3fi_str[0]) <= 67\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q3fii": { - "name": "q3fii", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> len(q3fii_str) and ord(q3fii_str[0]) >= 65 and ord(q3fii_str[0]) <= 67\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q3fiii": { - "name": "q3fiii", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> len(q3fiii_str) and ord(q3fiii_str[0]) >= 65 and ord(q3fiii_str[0]) <= 67\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": 
"doctest" - } - ] - }, - "q4a": { - "name": "q4a", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> q4a_str = grading_util.load_results(\"result_4a\")[0]\n>>> int(q4a_str) >= 0 and int(q4a_str) <= 10000\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q4b": { - "name": "q4b", - "points": 2, - "suites": [ - { - "cases": [ - { - "code": ">>> q4b_str = grading_util.load_results(\"result_4b\")[0]\n>>> int(q4b_str) >= 3000 and int(q4b_str) <= 4000\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q4c": { - "name": "q4c", - "points": 2, - "suites": [ - { - "cases": [ - { - "code": ">>> cur_test_4c = grading_util.load_results(\"result_4c\")[0]\n>>> cur_test_4c[0]['name'] == 'Asian Fusion Bowl' and cur_test_4c[0]['category'] == 'Street Vendors'\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - }, - "q4d": { - "name": "q4d", - "points": 1, - "suites": [ - { - "cases": [ - { - "code": ">>> veg_count = q4b_str = grading_util.load_results(\"result_4d\")[0]\n>>> veg_count > 0 and veg_count < 2000\nTrue", - "hidden": false, - "locked": false - } - ], - "scored": true, - "setup": "", - "teardown": "", - "type": "doctest" - } - ] - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/DemoAltTags.ipynb b/test_notebooks/DemoAltTags.ipynb similarity index 100% rename from DemoAltTags.ipynb rename to test_notebooks/DemoAltTags.ipynb diff --git a/DemoHeading.ipynb b/test_notebooks/DemoHeading.ipynb similarity index 100% rename from DemoHeading.ipynb rename to test_notebooks/DemoHeading.ipynb diff --git a/DemoTextContrast.ipynb b/test_notebooks/DemoTextContrast.ipynb similarity index 100% rename from DemoTextContrast.ipynb rename to test_notebooks/DemoTextContrast.ipynb diff --git 
a/DemoTransparency.ipynb b/test_notebooks/DemoTransparency.ipynb similarity index 100% rename from DemoTransparency.ipynb rename to test_notebooks/DemoTransparency.ipynb diff --git a/test_notebooks/Untitled.ipynb b/test_notebooks/Untitled.ipynb index 13aa78b02..c865a362b 100644 --- a/test_notebooks/Untitled.ipynb +++ b/test_notebooks/Untitled.ipynb @@ -4,7 +4,13 @@ "cell_type": "code", "execution_count": 4, "id": "27a9f924-6e0b-4ca5-8bba-a2af2dfbddd8", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", @@ -93,14 +99,7 @@ ] }, { - "cell_type": "markdown", - "id": "6bb9f643-bd59-4fa8-910f-00a3934dfc99", - "metadata": {}, - "source": [ - "![image](https://images.unsplash.com/photo-1604147706283-d7119b5b822c?q=80&w=1000&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxzZWFyY2h8Mnx8d2hpdGUlMjB0ZXh0dXJlfGVufDB8fDB8fHww)" - ] - }, - { + "attachments": {}, "cell_type": "markdown", "id": "acd43991-a8d7-4983-b51f-b873b30e968b", "metadata": { @@ -155,7 +154,7 @@ "\n", "![h](testLocalImage.png)\n", "\n", - "\"h\"" + "\"\"" ] }, { @@ -237,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/test_notebooks/Untitled1.ipynb b/test_notebooks/Untitled1.ipynb index f3516412d..8c81ae6f0 100644 --- a/test_notebooks/Untitled1.ipynb +++ b/test_notebooks/Untitled1.ipynb @@ -25,7 +25,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/Untitled2.ipynb b/test_notebooks/Untitled2.ipynb similarity index 100% rename from Untitled2.ipynb rename to test_notebooks/Untitled2.ipynb diff --git a/Untitled3.ipynb b/test_notebooks/Untitled3.ipynb similarity index 100% rename from Untitled3.ipynb rename to test_notebooks/Untitled3.ipynb diff --git 
a/test_notebooks/lowContrastGraph.ipynb b/test_notebooks/lowContrastGraph.ipynb index 23e55bf5e..e4ebfee2d 100644 --- a/test_notebooks/lowContrastGraph.ipynb +++ b/test_notebooks/lowContrastGraph.ipynb @@ -29,52 +29,41 @@ "execution_count": 1, "id": "03f2a4c6-8f8c-4cb3-a6ce-3f8b93adc9c0", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", + "# import matplotlib.pyplot as plt\n", + "# import numpy as np\n", "\n", - "# Data\n", - "x = np.linspace(0, 10, 100)\n", - "y1 = np.sin(x)\n", - "y2 = np.cos(x)\n", + "# # Data\n", + "# x = np.linspace(0, 10, 100)\n", + "# y1 = np.sin(x)\n", + "# y2 = np.cos(x)\n", "\n", - "# Create plot\n", - "plt.figure(figsize=(8, 6))\n", + "# # Create plot\n", + "# plt.figure(figsize=(8, 6))\n", "\n", - "# Sine curve with poor color contrast and poor transparency\n", - "plt.plot(x, y1, label='Sine', color='lightgrey', alpha=0.2)\n", + "# # Sine curve with poor color contrast and poor transparency\n", + "# plt.plot(x, y1, label='Sine', color='lightgrey', alpha=0.2)\n", "\n", - "# Cosine curve with poor color contrast\n", - "plt.plot(x, y2, label='Cosine', color='yellow')\n", + "# # Cosine curve with poor color contrast\n", + "# plt.plot(x, y2, label='Cosine', color='yellow')\n", "\n", - "# Unstructured header (just a plain text, no structure)\n", - "plt.text(2, 1, 'A Graph', fontsize=20, color='grey')\n", + "# # Unstructured header (just a plain text, no structure)\n", + "# plt.text(2, 1, 'A Graph', fontsize=20, color='black')\n", "\n", - "# X-axis label\n", - "plt.xlabel('X-axis')\n", + "# # X-axis label\n", + "# plt.xlabel('X-axis')\n", "\n", - "# Y-axis label\n", - "plt.ylabel('Y-axis')\n", + "# # Y-axis label\n", + "# plt.ylabel('Y-axis')\n", "\n", - "# No title\n", + "# # No title\n", "\n", - "# No legend added\n", - "# plt.legend()\n", + "# # No legend added\n", + "# # plt.legend()\n", "\n", - "# Display the plot\n", - "plt.show()" + "# # Display the plot\n", + "# plt.show()" ] } ], @@ -94,7 +83,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/test_notebooks/proj4.ipynb b/test_notebooks/proj4.ipynb 
index 7a91a3b56..05189a696 100644 --- a/test_notebooks/proj4.ipynb +++ b/test_notebooks/proj4.ipynb @@ -5,7 +5,11 @@ "execution_count": null, "metadata": { "deletable": false, - "editable": false + "editable": false, + "slideshow": { + "slide_type": "" + }, + "tags": [] }, "outputs": [], "source": [ @@ -16,7 +20,13 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "source": [ "# Project 4: Mongo \n", "\n", diff --git a/projA1.ipynb b/test_notebooks/projA1.ipynb similarity index 100% rename from projA1.ipynb rename to test_notebooks/projA1.ipynb