04_scraping_notebook.html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
  <meta charset="utf-8">
  <meta name="generator" content="quarto-0.9.165">
  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
  <title>Web scraping</title>
  <style>
    code{white-space: pre-wrap;}
    span.smallcaps{font-variant: small-caps;}
    span.underline{text-decoration: underline;}
    div.column{display: inline-block; vertical-align: top; width: 50%;}
    div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
    ul.task-list{list-style: none;}
    pre > code.sourceCode { white-space: pre; position: relative; }
    pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
    pre > code.sourceCode > span:empty { height: 1.2em; }
    .sourceCode { overflow: visible; }
    code.sourceCode > span { color: inherit; text-decoration: inherit; }
    div.sourceCode { margin: 1em 0; }
    pre.sourceCode { margin: 0; }
    @media screen {
    div.sourceCode { overflow: auto; }
    }
    @media print {
    pre > code.sourceCode { white-space: pre-wrap; }
    pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
    }
    pre.numberSource code
      { counter-reset: source-line 0; }
    pre.numberSource code > span
      { position: relative; left: -4em; counter-increment: source-line; }
    pre.numberSource code > span > a:first-child::before
      { content: counter(source-line);
        position: relative; left: -1em; text-align: right; vertical-align: baseline;
        border: none; display: inline-block;
        -webkit-touch-callout: none; -webkit-user-select: none;
        -khtml-user-select: none; -moz-user-select: none;
        -ms-user-select: none; user-select: none;
        padding: 0 4px; width: 4em;
        color: #aaaaaa;
      }
    pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
    div.sourceCode
      {   }
    @media screen {
    pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
    }
    code span.al { color: #ff0000; font-weight: bold; } /* Alert */
    code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
    code span.at { color: #7d9029; } /* Attribute */
    code span.bn { color: #40a070; } /* BaseN */
    code span.bu { } /* BuiltIn */
    code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
    code span.ch { color: #4070a0; } /* Char */
    code span.cn { color: #880000; } /* Constant */
    code span.co { color: #60a0b0; font-style: italic; } /* Comment */
    code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
    code span.do { color: #ba2121; font-style: italic; } /* Documentation */
    code span.dt { color: #902000; } /* DataType */
    code span.dv { color: #40a070; } /* DecVal */
    code span.er { color: #ff0000; font-weight: bold; } /* Error */
    code span.ex { } /* Extension */
    code span.fl { color: #40a070; } /* Float */
    code span.fu { color: #06287e; } /* Function */
    code span.im { } /* Import */
    code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
    code span.kw { color: #007020; font-weight: bold; } /* Keyword */
    code span.op { color: #666666; } /* Operator */
    code span.ot { color: #007020; } /* Other */
    code span.pp { color: #bc7a00; } /* Preprocessor */
    code span.sc { color: #4070a0; } /* SpecialChar */
    code span.ss { color: #bb6688; } /* SpecialString */
    code span.st { color: #4070a0; } /* String */
    code span.va { color: #19177c; } /* Variable */
    code span.vs { color: #4070a0; } /* VerbatimString */
    code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
  </style>

  <script src="04_scraping_notebook_files/libs/clipboard/clipboard.min.js"></script>
  <script src="04_scraping_notebook_files/libs/quarto-html/quarto.js"></script>
  <script src="04_scraping_notebook_files/libs/quarto-html/popper.min.js"></script>
  <script src="04_scraping_notebook_files/libs/quarto-html/tippy.umd.min.js"></script>
  <script src="04_scraping_notebook_files/libs/quarto-html/anchor.min.js"></script>
  <link href="04_scraping_notebook_files/libs/quarto-html/tippy.css" rel="stylesheet">
  <link id="quarto-text-highlighting-styles" href="04_scraping_notebook_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet">
  <script src="04_scraping_notebook_files/libs/bootstrap/bootstrap.min.js"></script>
  <link href="04_scraping_notebook_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
  <link href="04_scraping_notebook_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet">
  <!--[if lt IE 9]>
    <script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
  <![endif]-->
</head>
<body class="fullcontent">
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">

<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">


<div class="quarto-title"><h1 class="title display-7">Web scraping</h1><p class="subtitle lead">SICSS, 2022</p></div></header>
<section id="web-scraping-notebook" class="level1">
<h1>Web scraping notebook</h1>
<p>In this worksheet, I introduce you to how we might gather data through screen-scraping (or server-side) techniques; the second worksheet will go through an example of how we use API (or client-side) techniques for gathering data.</p>
<section id="tutorial-screen-scraping" class="level2">
<h2 class="anchored" data-anchor-id="tutorial-screen-scraping">Tutorial: Screen-scraping</h2>
<p>In this tutorial, you will learn how to scrape text from webpages in R:</p>
<ul>
<li>How to identify the CSS selectors of page elements using SelectorGadget (see the <a href="https://rvest.tidyverse.org/articles/articles/selectorgadget.html">rvest SelectorGadget article</a> for a detailed overview)</li>
<li>How to use the <code>rvest</code> package to scrape parts of a webpage</li>
</ul>
</section>
<section id="setup" class="level2">
<h2 class="anchored" data-anchor-id="setup">Setup</h2>
<p>To practice these skills, we will use a series of webpages on the Internet Archive that host material collected at the Arab Spring protests in Egypt in 2011. The original website can be seen <a href="https://www.tahrirdocuments.org/">here</a> and below.</p>
<p><img src="images/tahrir_page.png" class="img-fluid" style="width:100.0%" alt="Screenshot of the original Tahrir Documents website"></p>
</section>
<section id="load-data-and-packages" class="level2">
<h2 class="anchored" data-anchor-id="load-data-and-packages">Load data and packages</h2>
<p>Before proceeding, we’ll load the packages we will need for this tutorial.</p>
<div class="cell">
<div class="sourceCode" id="cb1"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse) <span class="co"># loads dplyr, ggplot2, and others</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggthemes) <span class="co"># includes a set of themes to make your visualizations look nice!</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(readr) <span class="co"># more informative and easy way to import data</span></span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(stringr) <span class="co"># to handle text elements</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(rvest) <span class="co">#for scraping</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>We can load the final dataset we will produce with:</p>
<div class="cell">
<div class="sourceCode" id="cb2"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>pamphdata <span class="ot">&lt;-</span> <span class="fu">read_csv</span>(<span class="st">"data/pamphlets_formatted_gsheets.csv"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stderr">
<pre><code>Rows: 523 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): title, text, tags, imageurl, imgID, image
dbl  (1): year
date (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.</code></pre>
</div>
</div>
<p>You can also view the formatted output of this scraping exercise, alongside images of the documents in question, in Google Sheets <a href="https://docs.google.com/spreadsheets/d/1rg2VTV6uuknpu6u-L5n7kvQ2cQ6e6Js7IHp7CaSKe90/edit?usp=sharing">here</a>.</p>
<p>If you’re working on this document from your own computer (“locally”) you can download the Tahrir documents data in the following way:</p>
<div class="cell">
<div class="sourceCode" id="cb4"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>pamphdata <span class="ot">&lt;-</span> <span class="fu">read_csv</span>(<span class="st">"https://raw.githubusercontent.com/cjbarrie/sicss_21/main/01_scraping_APIs/data/pamphlets_formatted_gsheets.csv"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="inspect-and-filter-data" class="level2">
<h2 class="anchored" data-anchor-id="inspect-and-filter-data">Inspect and filter data</h2>
<p>Let’s have a look at what we will end up producing:</p>
<div class="cell">
<div class="sourceCode" id="cb5"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(pamphdata)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code># A tibble: 6 × 8
  title                        date        year text  tags  imageurl imgID image
  &lt;chr&gt;                        &lt;date&gt;     &lt;dbl&gt; &lt;chr&gt; &lt;chr&gt; &lt;chr&gt;    &lt;chr&gt; &lt;chr&gt;
1 The Season of Anger Sets in… 2011-03-30  2011 The … Soli… https:/… imgI… =Arr…
2 The Most Important Workers’… 2011-03-30  2011 [Voi… Soli… https:/… imgI… &lt;NA&gt; 
3 Yes it’s the Workers’ and E… 2011-03-30  2011 [Voi… Soli… https:/… imgI… &lt;NA&gt; 
4 The Revolution is Still Ong… 2011-03-30  2011 [Voi… Revo… https:/… imgI… &lt;NA&gt; 
5 Voice of the Revolution, #3  2011-03-30  2011 Febr… Revo… https:/… imgI… &lt;NA&gt; 
6 We Are Still Continuing Unt… 2011-03-29  2011 We A… Dema… https:/… imgI… &lt;NA&gt; </code></pre>
</div>
</div>
</section>
<section id="inspecting-html-contents" class="level2">
<h2 class="anchored" data-anchor-id="inspecting-html-contents">Inspecting HTML contents</h2>
<p>We are going to return to the Internet Archived webpages to see how we can produce this final formatted dataset. The archived Tahrir Documents webpages can be accessed <a href="https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/">here</a>.</p>
<p>We first want to inspect how the contents of each webpage are stored.</p>
<p>When we scroll to the very bottom of the page, we see listed a number of hyperlinks to documents stored by month:</p>
<p><img src="images/tahrir_archives.png" class="img-fluid" alt="Screenshot of the monthly archive links listed at the bottom of the Tahrir Documents page"></p>
<p>We will click through the documents stored for March and then click on the top listed pamphlet entitled “The Season of Anger Sets in Among the Arab Peoples.” You can access this <a href="https://wayback.archive-it.org/2358/20120130161341/http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/">here</a>.</p>
<p>We will store this url to inspect the HTML it contains as follows:</p>
<div class="cell">
<div class="sourceCode" id="cb7"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>url <span class="ot">&lt;-</span> <span class="st">"https://wayback.archive-it.org/2358/20120130161341/http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/"</span></span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a>html <span class="ot">&lt;-</span> <span class="fu">read_html</span>(url)</span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a>html</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>{html_document}
&lt;html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en-US"&gt;
[1] &lt;head profile="http://gmpg.org/xfn/11"&gt;\n&lt;!-- Start Wayback Rewrite JS In ...
[2] &lt;body&gt; \n&lt;!--\n     FILE ARCHIVED ON 16:13:41 Jan 30, 2012 AND RETRIEVED  ...</code></pre>
</div>
</div>
<p>Well, this isn’t particularly useful. Let’s now see how we can extract the text contained inside.</p>
<div class="cell">
<div class="sourceCode" id="cb9"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>pagetext <span class="ot">&lt;-</span> html <span class="sc">%&gt;%</span></span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">html_text</span>()</span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a>pagetext</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>[1] "\nif (window._WBWombatInit) {\n  _wb_uc = new URL(\"http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/\");\n  wbinfo = {}\n  wbinfo.url = \"http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/\";\n  wbinfo.timestamp = \"20120130161341\";\n  wbinfo.request_ts = \"20120130161341\";\n  wbinfo.prefix = \"https://wayback.archive-it.org/2358/\";\n  wbinfo.mod = \"if_\";\n  wbinfo.is_framed = false;\n  wbinfo.is_live = false;\n  wbinfo.coll = \"2358\";\n  wbinfo.proxy_magic = \"\";\n  wbinfo.static_prefix = \"/static/\";\n  wbinfo.enable_auto_fetch = true;\n  wbinfo.auto_fetch_worker_prefix = \"https://wayback.archive-it.org/2358/\";\n  wbinfo.wombat_ts = \"20120130161341\";\n  wbinfo.wombat_sec = \"1327940021\";\n  wbinfo.wombat_scheme = ( _wb_uc.protocol || 'http').replace(/:$/, '');\n  wbinfo.wombat_host = _wb_uc.host;\n  wbinfo.wombat_opts = {\n    no_rewrite_prefixes: [\n                            \"https://wayback.archive-it.org/2358/\",\n                            \"//archive-it.org/\",\n                            \"https://partner.archive-it.org/\",\n                          ]\n  };\n  window._WBWombatInit(wbinfo);\n\n  // variables useful for rulesengine rewrites from old ait-client_rewrite.js\n  WB_RewriteUrl = _wb_wombat.rewrite_url;\n  WB_ExtractOrig = _wb_wombat.extract_orig;\n  WB_wombat_self_location = window.WB_wombat_location\n}\n  The Season of Anger Sets in Among the Arab Peoples//&lt;![CDATA[\n\t// Google Analytics for WordPress by Yoast v4.0.10 | http://yoast.com/wordpress/google-analytics/\n\tvar _gaq = _gaq || [];\n\t_gaq.push(['_setAccount','UA-7521051-7']);\n\t_gaq.push(['_trackPageview']);\n\t(function() {\n\t\tvar ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;\n\t\tga.src = ('https:' == document.location.protocol ? 
'https://wayback.archive-it.org/2358/20120130161341/https://ssl' : 'https://wayback.archive-it.org/2358/20120130161341/http://www') + '.google-analytics.com/ga.js';\n\t\tvar s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);\n\t})();\n\t//]]&gt;\nbody { background-color: #ececec; }\n \n\n\n\n  #wm-media-curtain {\n    display: none !important;\n    width: 100vw !important;\n    height: 100vh !important;\n    grid-template-columns: 5% 90% 5%;\n    z-index: 2000000001 !important;\n    background-color: #000000f7 !important;\n    line-height:normal !important;\n    padding: 0 !important;\n    color: #000 !important;\n    position: fixed !important;\n    top: 0 !important;\n    left: 0 !important;\n    font-size: medium !important;\n    font-family: sans-serif !important;\n    font-weight: normal !important;\n  }\n  #wm-media-prev, #wm-media-next {\n    align-self: center !important;\n    justify-self: center !important;\n    color: #fff !important;\n    font-size: 3rem !important;\n    opacity: 0.3 !important;\n  }\n  #wm-media-prev {\n    grid-column: 1 !important;\n    grid-row: 1 / 3 !important;\n    cursor: pointer !important;\n  }\n  #wm-media-next {\n    grid-column: 3 !important;\n    grid-row: 1 / 3 !important;\n    cursor: pointer !important;\n  }\n  #wm-media-close-button {\n    align-self: start !important;\n    justify-self: center !important;\n    color: #fff !important;\n    font-size: 3rem !important;\n    opacity: 0.3 !important;\n    grid-column: 3 !important;\n    grid-row: 1 !important;\n    cursor: pointer !important;\n  }\n  #wm-media-close-button:hover, #wm-media-close-button:active,\n  #wm-media-close-button:focus, #wm-media-prev:hover,\n  #wm-media-prev:active, #wm-media-prev:focus, #wm-media-next:hover,\n  #wm-media-next:active, #wm-media-next:focus {\n    opacity: 0.9 !important;\n  }\n  #wm-media-player-container {\n    align-self: center !important;\n    justify-self: center !important;\n    color: #fff 
!important;\n    grid-row: 1 / 3 !important;\n    grid-column: 2 !important;\n  }\n  #wm-media-page-number {\n    align-self: end !important;\n    justify-self: center !important;\n    color: #fff !important;\n    grid-row: 2 !important;\n    grid-column: 2 !important;\n    padding-bottom: 3vh !important;\n  }\n  #wm-media-title, #wm-media-description {\n    /* width: 50vw !important; */\n    margin: 1em auto !important;\n  }\n  #wm-media-title {\n    font-weight: bold !important;\n  }\n\n  &lt;\n  \n    &nbsp;\n    &nbsp;\n    &nbsp;\n  \n  &nbsp;\n  \n    ×\n  &gt;\n\n\n\n#wm-disclaim {\ndisplay:none;\nline-height:normal !important;\nborder:1px solid #000 !important;\npadding:5px !important;\nposition:relative !important;\nz-index: 2147483643 !important;\ncolor:#000 !important;\n\nbackground-color:lightYellow !important;\n\nfont-size:medium !important;\nfont-family:sans-serif !important;\nfont-weight:normal !important;\ntext-align:center !important;\n}\n\n#wm-disclaim a {\ncolor:#00f !important;\ntext-decoration:underline !important;\nfont-size:medium !important;\nfont-weight:normal !important;\n}\n\n#wm-disclaim a:hover {\nbackground-color: transparent !important;\n}\n\n#wm-disclaim-hide {\nfloat:right !important;\nmargin:0 0 5px 5px !important;\nborder:1px solid #ccc !important;\npadding:1px 5px !important;\ncursor:default !important;\nfont-size:x-small !important;\nfont-weight:bold !important;\ncolor:#666 !important;\n}\n#wm-disclaim-hide:hover {\nborder:1px outset #ccc !important;\n}\n#wm-disclaim-hide:focus, #wm-disclaim-hide:active {\nborder:1px inset #ccc !important;\n}\n\n#wm-disclaim-img {\nmargin-top: -8px !important;\nfloat: left !important;\nvertical-align:middle !important;\npadding: 0px !important;\n}\n\nhide\n\nYou are viewing an archived web page, collected at the request of American University in Cairo using Archive-It. 
This page was captured  on 16:13:41 Jan 30, 2012,\nand is part of the Egypt Politics and Revolution collection.\nThe information on this web page may be out of date. See All versions of this archived page.\nLoading media information\n\n                Enable QA\n\nView Missing URLs\n\n\n//&lt;![CDATA[\n//lazily loading AIT metadata link generation script\nvar lazyLoader = function(evt)\n{\n  document.getElementById('lazyScript').src = 'https://partner.archive-it.org/metadata_link/2358/http%3A//www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/';\n  document.getElementById('loadMissingCountScript').src = 'https://partner.archive-it.org/missing_url_record?getJS=true&amp;type=count&amp;collId=2358&amp;checkBefore=20220612195028&amp;cPage=http%3A%2F%2Fwww.tahrirdocuments.org%2F2011%2F03%2Fvoice-of-the-revolution-3-page-2%2F&amp;timestamp=20120130161341';\n};\n\n//ie8 and below do not support addEventListener\nif (navigator.appName.indexOf('MSIE 7') &gt; 0){\n  //alert(\"msie\");\n}\n\nif (window.addEventListener){\n  window.addEventListener(\"load\", lazyLoader, true);\n} else if (window.attachEvent){\n  window.attachEvent(\"onload\", lazyLoader);\n}\n//]]&gt;\n\nvar disclaimBanner = document.getElementById(\"wm-disclaim\");\nif(disclaimBanner != null) {\n  disclaimElement(disclaimBanner);\n}\n\nfunction fixUpWBBanner() {\n  var wb_banner = document.getElementById(\"wm-disclaim\");\n\n  if (wb_banner) {\n    if (document.body.firstChild !== wb_banner) {\n      document.body.insertBefore(wb_banner, document.body.firstChild);\n    }\n  }\n}\n\nsetInterval(fixUpWBBanner, 2000);\n\n  var __wmAllMedia = [];\n  var __wmArchivedMedia = [];\n  var __wmPlayerIndex = -1;\n\n  function __wmMediaNext() {\n    if (__wmPlayerIndex + 1 &lt; __wmArchivedMedia.length) {\n      __wmPlayerIndex++;\n      __wmUpdateTheaterState();\n    }\n  }\n\n  function __wmMediaPrev() {\n    if (__wmPlayerIndex &gt; 0) {\n      __wmPlayerIndex--;\n      __wmUpdateTheaterState();\n    
}\n  }\n\n  function __wmUpdateTheaterState() {\n    var activeMedia = __wmArchivedMedia[__wmPlayerIndex];\n\n    if (activeMedia.title) {\n      document.getElementById(\"wm-media-title\").textContent = activeMedia.title;\n    }\n    if (activeMedia.description) {\n      document.getElementById(\"wm-media-description\").textContent = activeMedia.description;\n    }\n    document.getElementById(\"wm-media-page-number\").textContent =\n      (__wmPlayerIndex + 1) + \" of \" + __wmArchivedMedia.length;\n\n    var newPlayerE;\n    if (activeMedia.isAudio) {\n      newPlayerE = document.createElement(\"audio\");\n    } else {\n      newPlayerE = document.createElement(\"video\");\n      if (activeMedia.wbThumb) {\n        newPlayerE.setAttribute(\"poster\", activeMedia.wbThumb);\n      }\n      newPlayerE.addEventListener(\"loadeddata\", function(e) {\n        document.getElementById(\"wm-media-title\").style.setProperty(\"width\", newPlayerE.videoWidth + \"px\", \"important\");\n        document.getElementById(\"wm-media-description\").style.setProperty(\"width\", newPlayerE.videoWidth + \"px\", \"important\");\n      });\n    }\n    newPlayerE.id = \"wm-media-player\";\n    newPlayerE.setAttribute(\"controls\", \"true\");\n\n    var oldPlayerE = document.getElementById(\"wm-media-player\");\n\n    newPlayerE.addEventListener(\"error\", function(e) {\n      if (newPlayerE.error) { // sometimes this event fires for no reason???\n        var errorE = document.createElement(\"div\");\n        errorE.id = \"wm-media-player\";\n        errorE.textContent = \"Failed to load media (it may not have been captured). 
\";\n        var linkE = document.createElement(\"a\");\n        linkE.setAttribute(\"href\", activeMedia.wbUrl);\n        linkE.textContent = \"Details\";\n        errorE.appendChild(linkE);\n        var oldPlayerE = document.getElementById(\"wm-media-player\");\n        oldPlayerE.parentElement.replaceChild(errorE, oldPlayerE);\n      }\n    });\n\n    newPlayerE.src = activeMedia.wbUrl;\n    var oldPlayerE = document.getElementById(\"wm-media-player\");\n    oldPlayerE.parentElement.replaceChild(newPlayerE, oldPlayerE);\n\n    if (__wmPlayerIndex &gt; 0) {\n      document.getElementById(\"wm-media-prev\").style.setProperty(\"display\", \"initial\", \"important\");\n    } else {\n      document.getElementById(\"wm-media-prev\").style.setProperty(\"display\", \"none\", \"important\");\n    }\n\n    if (__wmPlayerIndex + 1 &lt; __wmArchivedMedia.length) {\n      document.getElementById(\"wm-media-next\").style.setProperty(\"display\", \"initial\", \"important\");\n    } else {\n      document.getElementById(\"wm-media-next\").style.setProperty(\"display\", \"none\", \"important\");\n    }\n  }\n\n(function() {\n  var mediaInfoLoaded = false;\n  var mediaPlaced = false;\n  var pageUrl = 'http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/';\n  var ydlJsonUrl = 'https://wayback.archive-it.org/2358/20120130161341/youtube-dl:http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/';\n  var totalAudio;\n  var totalVideo;\n  var archivedAudio;\n  var archivedVideo;\n\n  function countEndingDots(s) {\n    var i = s.length - 1;\n    var count = 0;\n    while (i &gt;= 0 &amp;&amp; s[i] == '.') {\n      count += 1;\n      i -= 1;\n    }\n    return count;\n  }\n\n  function updateButtonWhileLoading() {\n    var oldText = document.getElementById('wm-media-button').textContent;\n    var oldDots = countEndingDots(oldText);\n    var newText;\n    if (oldDots &gt;= 3) {\n      newText = oldText.substring(0, oldText.length - oldDots);\n    } 
else {\n      newText = oldText + \".\";\n    }\n    document.getElementById('wm-media-button').textContent = newText;\n\n  }\n  var buttonUpdater = setInterval(updateButtonWhileLoading, 500);\n\n  function finalizeButton() {\n    clearInterval(buttonUpdater);\n    // \"Found 5 archived media items out of 5 total in page. Play Archived Media\"\n    var text = \"Found \" + __wmArchivedMedia.length + \" archived media \"\n      + (__wmArchivedMedia.length == 1 ? \"item\" : \"items\") + \" out of \"\n      + __wmAllMedia.length + \" total on this page. \";\n    var e = document.getElementById(\"wm-media-button\");\n    e.textContent = text;\n    e.style.setProperty(\"color\", \"#000\", \"important\");\n    if (__wmArchivedMedia &amp;&amp; __wmArchivedMedia.length) {\n      var playE = document.createElement(\"span\");\n      playE.textContent = \"▶️\";\n      playE.style.setProperty(\"cursor\", \"pointer\", \"important\");\n      playE.addEventListener(\"click\", function(e) {\n        document.getElementById(\"wm-media-curtain\").style.setProperty(\"display\", \"grid\", \"important\");\n      });\n      e.appendChild(playE);\n    }\n  }\n\n  function placeMediaOnPageIfReady(e) {\n    console.log(e.type + \" document.readyState=\" + document.readyState);\n    if (e.type === \"mediaInfoLoaded\") {\n      mediaInfoLoaded = true;\n    }\n    if (!mediaPlaced &amp;&amp; mediaInfoLoaded &amp;&amp; document.readyState === \"complete\") {\n      mediaPlaced = true;\n      if (__wmArchivedMedia &amp;&amp; __wmArchivedMedia.length) {\n        placeMediaOnPage(__wmArchivedMedia)\n      }\n    }\n  }\n  document.addEventListener(\"mediaInfoLoaded\", placeMediaOnPageIfReady);\n  document.addEventListener(\"readystatechange\", placeMediaOnPageIfReady);\n\n  function prepareMedia() {\n    window.fetch(ydlJsonUrl)\n      .then(function(response) {\n        return response.json();\n      })\n      .catch(function(e) {\n        return null;\n      })\n      .then(function(ydlJson) {\n 
       if (ydlJson) {\n          __wmAllMedia = enumerateMedia(ydlJson);\n        }\n        return __wmAllMedia;\n      })\n      .then(function(allMedia) {\n        var allMediaPromises = [];\n        if (allMedia &amp;&amp; allMedia.length) {\n          allMediaPromises = checkIfArchived(allMedia);\n        }\n        return allMediaPromises;\n      })\n      .then(function(allMedia) {   // promises are resolved now\n        __wmAllMedia = allMedia;\n        if (allMedia &amp;&amp; allMedia.length) {\n          __wmArchivedMedia = allMedia.filter(m =&gt; m.isArchived);\n        }\n        document.dispatchEvent(new Event(\"mediaInfoLoaded\"));\n        return __wmArchivedMedia;\n      })\n      .then(function(archivedMedia) {\n        if (archivedMedia &amp;&amp; archivedMedia.length) {\n          __wmPlayerIndex = 0;\n          __wmUpdateTheaterState();\n        }\n        return archivedMedia;\n      })\n      .then(function(archivedMedia) {\n        finalizeButton();\n      })\n      .catch(function(err) {\n        console.warn(\"error loading or setting up media stuff\", err);\n      });\n\n    document.addEventListener('keydown', function(e) {\n      if (e.keyCode == 27) { // esc\n        document.getElementById('wm-media-curtain').style.setProperty('display', 'none', 'important');\n      } else if (e.keyCode == 37) { // left\n        __wmMediaPrev();\n      } else if (e.keyCode == 39) { // right\n        __wmMediaNext();\n      }\n    });\n  }\n\n  function enumerateMedia(ydlJson) {\n    var entries = ydlJson.entries || [ydlJson];\n    var media = [];\n\n    for (var i = 0; i &lt; entries.length; i++) {\n      var entry = entries[i];\n      var url = entry.url.slice(entry.url.indexOf('/http')+1);\n      if (entry.protocol != \"https\" &amp;&amp; entry.protocol != \"http\") {\n        var nnnnn = String(i + 1).padStart(5, \"0\");\n        url = entry.webpage_url.slice(entry.webpage_url.indexOf('/http')+1);\n        url = \"youtube-dl:\" + nnnnn + \":\" + 
url;\n      }\n      media.push({\n        liveUrl: url,\n        wbUrl: ydlJsonUrl.replace(/youtube-dl:.*/, url),\n        isAudio: entry.format &amp;&amp; entry.format.includes(\"audio only\"),\n        width: entry.width,\n        height: entry.height,\n        wbThumb: ydlJsonUrl.replace(/youtube-dl:.*/, entry.thumbnail.slice(entry.thumbnail.indexOf('/http')+1)),\n        title: entry.title,\n        description: entry.description\n      });\n    }\n\n    return media;\n  }\n\n  /* sets isArchived: true/false for each media element */\n  function checkIfArchived(media) {\n    var promises = [];\n    for (var i = 0; i &lt; media.length; i++) {\n      var promise = (function(mediaItem) {\n        return window.fetch(mediaItem.wbUrl, {method: \"HEAD\"})\n          .then(function(response) {\n            mediaItem.isArchived = response.ok;\n            return mediaItem;\n          });\n      })(media[i]);\n      promises.push(promise);\n    }\n    var allPromise = Promise.all(promises); // wait for all to resolve\n    return allPromise;\n  }\n\n  function findElementsToReplace() {\n    var result = [];\n    var elements = document.querySelectorAll(\"audio,video,object,embed,iframe\");\n    for (var i = 0; i &lt; elements.length; i++) {\n      var e = elements[i];\n      if (e.id == \"wm-media-player\") {\n        continue; // don't replace lightbox player!\n      } else if (e.tagName == \"IFRAME\") {\n        if (e.src.indexOf(\"youtube.com/embed/\") &gt; 0 || e.src.indexOf(\"player.vimeo.com/video\") &gt; 0) {\n          result.push(e);\n        }\n      } else {\n        result.push(e);\n      }\n    }\n    return result;\n  }\n\n  function placeMediaOnPage(media) {\n    var elementsToReplace = findElementsToReplace();\n    for (var i = 0; i &lt; elementsToReplace.length &amp;&amp; i &lt; media.length; i++) {\n      if (!media[i].isArchived)\n        continue;\n\n      var mediaE = document.createElement(!media[i].isAudio ? 
\"video\" : \"audio\");\n      mediaE.setAttribute(\"controls\", \"true\");\n      // mediaE.setAttribute(\"style\", \"width: 100%; height: 100%\");\n      mediaE.setAttribute(\"style\", \"width: 100%; height: auto\");\n      mediaE.src = media[i].wbUrl;\n\n      elementsToReplace[i].parentElement.replaceChild(mediaE, elementsToReplace[i]);\n    }\n  }\n\n  prepareMedia();\n})();\n\n\n\n  \n    \n\n      \n        \n      \n            \n      &nbsp;\n\n    \n  \n  \n  \n    \n\n\n      \n      \n      About\nRevolution\nLogistics\n\tRevolutionary Newspapers\n\tDemands\n\tCalls to Protest\n\nPolitics\nWafd Party\n\tThe Popular Committees for the Defense of the Revolution\n\tThe Party of the Popular Socialist Alliance\n\tThe Muslim Brotherhood\n\tThe National Progressive Unionist Party (Hizb al-Tagammu’)\n\tThe Justice Party\n\tOther Parties\n\tThe Egyptian Communist Party\n\tCandidates\n\nSolidarity\nWorkers\n\tUnions\n\tPalestine\n\tMovements\n\tLibya\n\nCulture\nPoetry\n\tMedia\n\tSigns from Tahrir\n\tHealth\n\tFiction\n\nConstitution\nMarch Referendum\n\tTheory\n\tDostour Newsletters\n\tConstitution First Movement\n\nRegime\nMubarak and Family\n\tPolice\n\tPrisoners\n\tSecurity Forces\n\nReligion\nAl-Azhar\n\tCoptic Christians\n\tFamily\n\tMoral Conduct\n\tSalafism\n\tSectarian Strife\n\n\n      \n       \n\n      &nbsp;\n\n    \n  \n  \n  \n    \n\n      \n      \n            \n        \n        \n                        \n            The Season of Anger Sets in Among the Arab Peoples\n            \n            \n            \n              March 30, 2011 9:43 pm                                Solidarity                no comments\n              &nbsp;\n            \n            \n            \n            \n            \n            \n\nClick here to download the original.\nThe Season of Anger Sets in Among the Arab Peoples\n&nbsp;\nA member of the Algerian opposition warned the Arab rulers that the Tunisian revolution would initiate a revolutionary tsunami 
phenomenon that would sweep away their thrones. At the beginning of last December, the end of the dictatorial regimes began with successive protests against high prices in Algeria and Jordan. Then the Tunisian revolution demanded the fall of the regime. Algeria caught the ball of flames, and protests ignited again after having calmed for a time. Then after the Tunisian president was expelled, the revolution reached Egypt. And when Egyptians deposed Mubarak, the sparks proceeded to Algeria, Jordan, Yemen, Bahrain, and then Libya.\n&nbsp;\nThe issue is not one of infection; rather, the subjugated peoples discovered the potential of their own free will. The revolution in one country inspired the populations in other countries, for people imprison themselves in waiting, hoping and yearning, until their spirits emerge carrying the roar of anger that shakes thrones. Every population transfers the experience of other revolutions, borrowing their slogans—’the people want the fall of the regime,’ and ‘peaceful, peaceful’—and their tactics, like the Friday of Anger in Jordan and sit-ins held in major squares located in middle of the capital, such as Egypt’s Tahrir Square. Tunisian revolutionaries contacted Egyptians, providing them with some tactics to beat the regime’s machine, like spraying the windshields of armored cars with colors to paralyze and get the better of them.\n&nbsp;\nBeyond inspiration and transmitting experience, there is also solidarity, for the Tunisian revolution’s victory celebrations spread to Jordan and Egypt. And most of the Arab peoples participated in Egypt’s joy. There were many moving scenes, like the singing of the national anthem and the distribution of sweets and drinks in Jordan and Gaza in celebration of Mubarak’s deposal.\n&nbsp;\nHowever, the regimes that consolidate their interests over and against the people’s also rely on one another and exchange their experiences with oppression and harassment. 
And now confrontations are raging in Bahrain, Yemen, and Libya as mummified regimes committing increased acts of violence and brutal murder against the protesters. Just as Egypt’s extinct regime did, they use thugs with soft weaponry to transform the scene into bloody chaos. Meanwhile, Israel threatens chaos at the regional level in support of its spies, the Arab rulers, since the one who benefits most from the bowing down of the Arab peoples and their backwardness is none other than Israel.\n&nbsp;\n\nThe solidarity and support of the Arab revolutions for each other is the destiny of the Arab region and the only way to protect the interests and wealth of its peoples.&nbsp; With the extinct regime issuing threats—as if from another world—that Omar Suleiman, Mubarak, or Ria and Sikina will return, the duty to protect our revolution has doubled. Our resistance won’t simply bring about complete, permanent victory for our own revolution; it will also support the revolutions of other populations who support us in turn. This is our duty and our destiny. For the populations who were long patient, the path of regression to what came before was no longer available. 
Their victory will soon be achieved through their resistance and free will.\n\n&nbsp;\n________________________\nAcquired March 2011\nTranslated by Yasmeen Mekawy\nTranslation reviewed by Emily Drumsta\n&nbsp;\n&nbsp;\nRelated posts:Call for the Military to Impeach MubarakA Very Important Proposal from the Coalition of the Youth of the Revolution in the City of Matai–al-...The Struggle Movement            \t\t\t\t                \n            \n            \n            &nbsp;\n            \n                        \n                 Share this post\n                    \n                    \n                    \n                    \n                    \n                    \n                    \n                    \n                 &nbsp;\n            \n                                \n        \n        \n        &nbsp;\n        \n        « The Most Important Workers’ Protests The Popular Alliance Party: Foundational Declaration »         \n        \n\n\n\n\n\n\n\n\n\n            \n      \n      \n      \n      \n                \n        Follow Us on Facebook &amp; Twitter&nbsp;About\t\t\tTahrir Documents is an ongoing effort to archive and translate activist papers from the 2011 Egyptian uprising and its aftermath. Materials are collected from demonstrations in Cairo’s Tahrir Square and published in complete English translation alongside scans of the original documents. 
The project is not affiliated with any political organization, Egyptian or otherwise.\nFor more information please contact tahrirdocuments@gmail.com\n\n\t\t&nbsp;CategoriesAl-Azhar\nCalls to Protest\nCandidates\nConstitution\nConstitution First Movement\nCulture\nDemands\nDostour Newsletters\nFamily\nFiction\nHealth\nLibya\nLogistics\nMarch Referendum\nMedia\nMilitary\nMilitary Tribunals\nMoral Conduct\nMovements\nMubarak and Family\nOther Parties\nPalestine\nPoetry\nPolice\nPolitics\nPrisoners\nRegime\nReligion\nRevolution\nRevolutionary Newspapers\nSalafism\nSectarian Strife\nSecurity Forces\nSigns from Tahrir\nSolidarity\nThe Egyptian Communist Party\nThe Justice Party\nThe Muslim Brotherhood\nThe National Progressive Unionist Party (Hizb al-Tagammu')\nTheory\nThe Party of the Popular Socialist Alliance\nThe Popular Committees for the Defense of the Revolution\nUnions\nWafd Party\nWorkers\n&nbsp;&nbsp;        &nbsp;\n      \n      \n      \n      &nbsp;\n\n    \n  \n\n\n  \n    &nbsp;\n    \n\n      \n\n      Archives\t\tJanuary 2012\n\tDecember 2011\n\tNovember 2011\n\tOctober 2011\n\tSeptember 2011\n\tAugust 2011\n\tJuly 2011\n\tJune 2011\n\tMay 2011\n\tApril 2011\n\tMarch 2011\n\t\t&nbsp;      &nbsp;\n\n\t   \n\t   \n\t   \n\t     \n\t     Tahrir Documents\t\tAbout\nContact\n\t\t&nbsp;\t     &nbsp;\n\t     \n\t   \n\t   \n\t   \n\t     \n\t      \t     &nbsp;\n\t     \n\t   \n\n\t   \n\t     \n\t     Search\n\tSearch for:\n\t\n\t&nbsp;\t     &nbsp;\n\t     \n\t   \n\t   \n\t   &nbsp;\n\n    \n  \n  \n  \n    \n      Designed by \n      Copyright © 2012 Tahrir Documents. All rights reserved.\n    \n  \n    \n\n\n\n\n  var _gaq = _gaq || [];\n  _gaq.push(['_setAccount', 'UA-7521051-7']);\n  _gaq.push(['_trackPageview']);\n\n  (function() {\n    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;\n    ga.src = ('https:' == document.location.protocol ? 
'https://wayback.archive-it.org/2358/20120130161341/https://ssl' : 'https://wayback.archive-it.org/2358/20120130161341/http://www') + '.google-analytics.com/ga.js';\n    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);\n  })();\n\n"</code></pre>
</div>
</div>
<p>Well this looks pretty terrifying now…</p>
<p>We need a way of quickly identifying where the relevant text is so that we can specify this when we are scraping. The most widely-used tool to achieve this is the “Selector Gadget” Chrome Extension. You can add this to your browser for free from the <a href="https://chrome.google.com/webstore/detail/selectorgadget/mhjhnkcfbdhnjickkkdbjoemdmbfginb?hl=en">Chrome Web Store</a>.</p>
<p>The tool works by allowing the user to point and click on elements of a webpage and then reporting the “CSS selector” that matches them. Unlike alternatives, such as “Inspect Element” browser tools, it lets us easily see how the webpage item is contained within CSS selectors (rather than HTML tags alone), which is easier to parse.</p>
<p>We can do this with our Tahrir documents as below:</p>
<p><img src="images/gifcap4.gif" class="img-fluid" style="width:100.0%" alt="Animation of Selector Gadget being used to select the main text of a Tahrir Documents page"></p>
<p>So now we know that the main text of the translated document is contained between “p” HTML tags. To identify the text between these HTML tags we can run:</p>
<div class="cell">
<div class="sourceCode" id="cb11"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a>pagetext <span class="ot">&lt;-</span> html <span class="sc">%&gt;%</span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">html_elements</span>(<span class="st">"p"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">html_text</span>(<span class="at">trim=</span><span class="cn">TRUE</span>)</span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a>pagetext</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code> [1] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
 [2] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
 [3] "Click here to download the original."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
 [4] "The Season of Anger Sets in Among the Arab Peoples"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
 [5] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
 [6] "A member of the Algerian opposition warned the Arab rulers that the Tunisian revolution would initiate a revolutionary tsunami phenomenon that would sweep away their thrones. At the beginning of last December, the end of the dictatorial regimes began with successive protests against high prices in Algeria and Jordan. Then the Tunisian revolution demanded the fall of the regime. Algeria caught the ball of flames, and protests ignited again after having calmed for a time. Then after the Tunisian president was expelled, the revolution reached Egypt. And when Egyptians deposed Mubarak, the sparks proceeded to Algeria, Jordan, Yemen, Bahrain, and then Libya."                                                                                                                                                                                      
 [7] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
 [8] "The issue is not one of infection; rather, the subjugated peoples discovered the potential of their own free will. The revolution in one country inspired the populations in other countries, for people imprison themselves in waiting, hoping and yearning, until their spirits emerge carrying the roar of anger that shakes thrones. Every population transfers the experience of other revolutions, borrowing their slogans—’the people want the fall of the regime,’ and ‘peaceful, peaceful’—and their tactics, like the Friday of Anger in Jordan and sit-ins held in major squares located in middle of the capital, such as Egypt’s Tahrir Square. Tunisian revolutionaries contacted Egyptians, providing them with some tactics to beat the regime’s machine, like spraying the windshields of armored cars with colors to paralyze and get the better of them."
 [9] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[10] "Beyond inspiration and transmitting experience, there is also solidarity, for the Tunisian revolution’s victory celebrations spread to Jordan and Egypt. And most of the Arab peoples participated in Egypt’s joy. There were many moving scenes, like the singing of the national anthem and the distribution of sweets and drinks in Jordan and Gaza in celebration of Mubarak’s deposal."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
[11] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[12] "However, the regimes that consolidate their interests over and against the people’s also rely on one another and exchange their experiences with oppression and harassment. And now confrontations are raging in Bahrain, Yemen, and Libya as mummified regimes committing increased acts of violence and brutal murder against the protesters. Just as Egypt’s extinct regime did, they use thugs with soft weaponry to transform the scene into bloody chaos. Meanwhile, Israel threatens chaos at the regional level in support of its spies, the Arab rulers, since the one who benefits most from the bowing down of the Arab peoples and their backwardness is none other than Israel."                                                                                                                                                                               
[13] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[14] "The solidarity and support of the Arab revolutions for each other is the destiny of the Arab region and the only way to protect the interests and wealth of its peoples.&nbsp; With the extinct regime issuing threats—as if from another world—that Omar Suleiman, Mubarak, or Ria and Sikina will return, the duty to protect our revolution has doubled. Our resistance won’t simply bring about complete, permanent victory for our own revolution; it will also support the revolutions of other populations who support us in turn. This is our duty and our destiny. For the populations who were long patient, the path of regression to what came before was no longer available. Their victory will soon be achieved through their resistance and free will."                                                                                                          
[15] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[16] "________________________"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
[17] "Acquired March 2011"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
[18] "Translated by Yasmeen Mekawy"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
[19] "Translation reviewed by Emily Drumsta"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
[20] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[21] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[22] "« The Most Important Workers’ Protests The Popular Alliance Party: Foundational Declaration »"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
[23] "Tahrir Documents is an ongoing effort to archive and translate activist papers from the 2011 Egyptian uprising and its aftermath. Materials are collected from demonstrations in Cairo’s Tahrir Square and published in complete English translation alongside scans of the original documents. The project is not affiliated with any political organization, Egyptian or otherwise."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
[24] "For more information please contact tahrirdocuments@gmail.com"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
[25] "Designed by"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
[26] "Copyright © 2012 Tahrir Documents. All rights reserved."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    </code></pre>
</div>
</div>
<p>😌😌😌😌😌😌😌😌😌</p>
<p>This looks quite a lot more manageable…!</p>
<p>What is happening here? Essentially, the <code>html_elements()</code> function is scanning the page and collecting all of the <code>&lt;p&gt;</code> (paragraph) elements, which we select using the “p” CSS selector. We are then just grabbing the text contained in these elements with the <code>html_text()</code> function.</p>
<p>So this gives us one way of capturing the text, but what if we wanted to get other elements of the document, for example the date or the tags attributed to each document? Well, we can do the same thing here too. Let’s take the example of getting the date:</p>
<p><img src="images/gifcap5.gif" class="img-fluid" style="width:100.0%" alt="Animation of Selector Gadget being used to select the date of a Tahrir Documents page"></p>
<p>We see here that the date is identified by the “.calendar” CSS selector and so we enter this into the same <code>html_elements()</code> function as before:</p>
<div class="cell">
<div class="sourceCode" id="cb13"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>pagedate <span class="ot">&lt;-</span> html <span class="sc">%&gt;%</span> </span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">html_elements</span>(<span class="st">".calendar"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">html_text</span>(<span class="at">trim=</span><span class="cn">TRUE</span>)</span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a>pagedate</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>[1] "March 30, 2011 9:43 pm"</code></pre>
</div>
</div>
<p>Of course, this is all well and good, but we also need a way of doing this at scale—we can’t just keep repeating the same process for every page we find as this wouldn’t be much quicker than just copy pasting. So how can we do this? Well we need first to understand the URL structure of the website in question.</p>
</section>
<section id="inspecting-url-structures" class="level2">
<h2 class="anchored" data-anchor-id="inspecting-url-structures">Inspecting URL structures</h2>
<p>When we scroll down the page we see listed a number of documents. Each of these directs to an individual pamphlet distributed at protests during the 2011 Egyptian Revolution.</p>
<p>Click on one of these and see how the URL changes.</p>
<p>We see that if our starting URL was:</p>
<div class="cell">

</div>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>https://wayback.archive-it.org/2358/20120130135111/http://www.tahrirdocuments.org/</code></pre>
</div>
</div>
<p>Then if we click on March 2011, the first month for which we have documents, we see that the url becomes:</p>
<div class="cell">

</div>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/</code></pre>
</div>
</div>
<p>, for August 2011 it becomes:</p>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>https://wayback.archive-it.org/2358/20120130142155/http://www.tahrirdocuments.org/2011/08/</code></pre>
</div>
</div>
<p>, and for January 2012 it becomes:</p>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>https://wayback.archive-it.org/2358/20120130142014/http://www.tahrirdocuments.org/2012/01/</code></pre>
</div>
</div>
<p>We notice that for each month, the URL changes with the addition of the year and month between forward slashes at the end of the URL. In the next section, we will go through how to efficiently create a set of URLs to loop through and retrieve the information contained in each individual webpage.</p>
</section>
<section id="looping-through-dates" class="level2">
<h2 class="anchored" data-anchor-id="looping-through-dates">Looping through dates</h2>
<p>We are going to want to retrieve the text of documents archived for each month. As such, our first task is to store each of these webpages as a series of strings. We could do this manually by, for example, pasting year and month strings to the end of each URL for each month from March, 2011 to January, 2012:</p>
<div class="cell">
<div class="sourceCode" id="cb19"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>url <span class="ot">&lt;-</span> <span class="st">"https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/"</span></span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a>url1 <span class="ot">&lt;-</span> <span class="fu">paste0</span>(url,<span class="st">"2011/03/"</span>)</span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a>url2 <span class="ot">&lt;-</span> <span class="fu">paste0</span>(url,<span class="st">"2011/04/"</span>)</span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a>url3 <span class="ot">&lt;-</span> <span class="fu">paste0</span>(url,<span class="st">"2011/05/"</span>)</span>
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1"></a><span class="co">#etc...</span></span>
<span id="cb19-8"><a href="#cb19-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb19-9"><a href="#cb19-9" aria-hidden="true" tabindex="-1"></a>urls <span class="ot">&lt;-</span> <span class="fu">c</span>(url1, url2, url3)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>But this wouldn’t be particularly efficient…</p>
<p>Instead, we can wrap all of this in a loop.</p>
<div class="cell">
<div class="sourceCode" id="cb20"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>urls <span class="ot">&lt;-</span> <span class="fu">character</span>(<span class="dv">0</span>)</span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">3</span><span class="sc">:</span><span class="dv">13</span>) {</span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a>  url <span class="ot">&lt;-</span> <span class="st">"https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/"</span></span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a>  newurl <span class="ot">&lt;-</span> <span class="fu">ifelse</span>(i <span class="sc">&lt;</span><span class="dv">10</span>, <span class="fu">paste0</span>(url,<span class="st">"2011/0"</span>,i,<span class="st">"/"</span>), </span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a>                   <span class="fu">ifelse</span>(i<span class="sc">&gt;=</span><span class="dv">10</span> <span class="sc">&amp;</span> i<span class="sc">&lt;=</span><span class="dv">12</span> , <span class="fu">paste0</span>(url,<span class="st">"2011/"</span>,i,<span class="st">"/"</span>), </span>
<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a>                          <span class="fu">paste0</span>(url,<span class="st">"2012/01/"</span>)))</span>
<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a>  urls <span class="ot">&lt;-</span> <span class="fu">c</span>(urls, newurl)</span>
<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>What’s going on here? Well, we are first specifying the starting URL as above. We are then iterating through the numbers 3 to 13. And we are telling R to take the new URL and then, depending on the number in the loop we are on, to take the base starting url— https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/ — and to paste on the end of it the string “2011/0”, then the number of the loop we are on, and then “/”. So, for the first “i” in the loop—the number 3—then we are effectively calling the equivalent of:</p>
<div class="cell">
<div class="sourceCode" id="cb21"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>i <span class="ot">&lt;-</span> <span class="dv">3</span></span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a>url <span class="ot">&lt;-</span> <span class="st">"https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/"</span></span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a>newurl <span class="ot">&lt;-</span> <span class="fu">paste0</span>(url,<span class="st">"2011/0"</span>,i,<span class="st">"/"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Which gives:</p>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>[1] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/"</code></pre>
</div>
</div>
<p>In the above, the <code>ifelse()</code> commands are simply telling R: if i (the number of the loop we are on) is less than 10 then <code>paste0(url,"2011/0",i,"/")</code>; i.e., if i is less than 10 then paste “2011/0”, then “i” and then “/”. So for the number 3 this becomes:</p>
<p><code>"https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/"</code></p>
<p>, and for the number 4 this becomes</p>
<p><code>"https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/04/"</code></p>
<p>If, however, <code>i&gt;=10 &amp; i&lt;=12</code> (i is greater than or equal to 10 and less than or equal to 12) then we are calling <code>paste0(url,"2011/",i,"/")</code> because here we do not need the first “0” in the months.</p>
<p>Finally, if (else) i is greater than 12 then we are calling <code>paste0(url,"2012/01/")</code>. For this last call, notice, we do not have to specify that i is greater than 12 because we are wrapping everything in <code>ifelse()</code> commands. With <code>ifelse()</code> calls like this, we are telling R if x “meets condition” then do y, otherwise do z. When we are wrapping multiple <code>ifelse()</code> calls within each other, we are effectively telling R if x “meets condition” then do y, or if x “meets other condition” then do z, otherwise do a. So here, the “otherwise do a” part of the <code>ifelse()</code> calls is saying: if i is not less than 10, and is not between 10 and 12, then paste “2012/01/” to the end of the URL.</p>
<p>Got it? I didn’t even get it on first reading… and I wrote it. The best way to understand what is going on is to run this code yourself and look at what each part is doing.</p>
</section>
<section id="looping-through-pages" class="level2">
<h2 class="anchored" data-anchor-id="looping-through-pages">Looping through pages</h2>
<p>So now we have our list of URLs for each month. What next?</p>
<p>Well if we go onto the page of a particular month, let’s say March, we will see that the page has multiple paginated tabs at the bottom. Let’s see what happens to the URL when we click on one of these:</p>
<div class="cell">

</div>
<p>We see that if our starting point URL for March, as above, was:</p>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/</code></pre>
</div>
</div>
<p>When we click through to page 2 it becomes:</p>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>https://wayback.archive-it.org/2358/20120130163651/http://www.tahrirdocuments.org/2011/03/page/2/</code></pre>
</div>
</div>
<p>And for page 3 it becomes:</p>
<div class="cell">
<div class="cell-output-stdout">
<pre><code>https://wayback.archive-it.org/2358/20120130163651/http://www.tahrirdocuments.org/2011/03/page/3/</code></pre>
</div>
</div>
<p>We can see pretty clearly that as we navigate through each page, there appears appended to the URL the string “page/2/” and “page/3/”. So this shouldn’t be too tricky to add to our list of URLs. But we want to avoid having to manually click through the archive for each month to figure out how many pagination tabs are at the bottom of each page.</p>
<p>Fortunately, we don’t have to. Using the “Selector Gadget” tool again we can automate this process by grabbing the highest number that appears in the pagination bar for each month’s pages. The code below achieves this:</p>
<div class="cell">
<div class="sourceCode" id="cb26"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a>urlpages_all <span class="ot">&lt;-</span> <span class="fu">character</span>(<span class="dv">0</span>) <span class="co">#create empty character string to deposit our final set of urls</span></span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a>urlpages <span class="ot">&lt;-</span> <span class="fu">character</span>(<span class="dv">0</span>) <span class="co">#create empty character string to deposit our urls for each page of each month</span></span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="fu">seq_along</span>(urls)) { <span class="co">#for loop for each url stored above</span></span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a>  url <span class="ot">&lt;-</span> urls[i] <span class="co">#take the first url from the vector of urls created above</span></span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a>  html <span class="ot">&lt;-</span> <span class="fu">read_html</span>(url) <span class="co">#read the html</span></span>
<span id="cb26-6"><a href="#cb26-6" aria-hidden="true" tabindex="-1"></a>  pages <span class="ot">&lt;-</span> html <span class="sc">%&gt;%</span></span>
<span id="cb26-7"><a href="#cb26-7" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_elements</span>(<span class="st">".page"</span>) <span class="sc">%&gt;%</span> <span class="co">#grab the page element</span></span>
<span id="cb26-8"><a href="#cb26-8" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_text</span>() <span class="co">#convert to text</span></span>
<span id="cb26-9"><a href="#cb26-9" aria-hidden="true" tabindex="-1"></a>  pageints <span class="ot">&lt;-</span> <span class="fu">as.integer</span>(pages) <span class="co">#convert to set of integers</span></span>
<span id="cb26-10"><a href="#cb26-10" aria-hidden="true" tabindex="-1"></a>  npages <span class="ot">&lt;-</span> <span class="fu">max</span>(pageints, <span class="at">na.rm =</span> T) <span class="co">#get number of highest integer</span></span>
<span id="cb26-11"><a href="#cb26-11" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb26-12"><a href="#cb26-12" aria-hidden="true" tabindex="-1"></a>  <span class="cf">for</span> (j <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span>npages) { <span class="co">#for loop for each of 1:highest page integer for that month's url</span></span>
<span id="cb26-13"><a href="#cb26-13" aria-hidden="true" tabindex="-1"></a>  newurl <span class="ot">&lt;-</span> <span class="fu">paste0</span>(url,<span class="st">"page/"</span>,j,<span class="st">"/"</span>) <span class="co">#create new url by pasting "page/" and then the number of that page, and then "/", matching the url structure identified above</span></span>
<span id="cb26-14"><a href="#cb26-14" aria-hidden="true" tabindex="-1"></a>  urlpages <span class="ot">&lt;-</span> <span class="fu">c</span>(urlpages, newurl) <span class="co">#bind with previously created page urls for each month</span></span>
<span id="cb26-15"><a href="#cb26-15" aria-hidden="true" tabindex="-1"></a>  }</span>
<span id="cb26-16"><a href="#cb26-16" aria-hidden="true" tabindex="-1"></a>  urlpages_all <span class="ot">&lt;-</span> <span class="fu">c</span>(urlpages_all, urlpages) <span class="co">#bind the monthly page by page urls together</span></span>
<span id="cb26-17"><a href="#cb26-17" aria-hidden="true" tabindex="-1"></a>  urlpages <span class="ot">&lt;-</span> <span class="fu">character</span>(<span class="dv">0</span>) <span class="co">#empty urlpages for next iteration of the first for loop</span></span>
<span id="cb26-18"><a href="#cb26-18" aria-hidden="true" tabindex="-1"></a>  urlpages_all <span class="ot">&lt;-</span> <span class="fu">gsub</span>(<span class="st">"page/1/"</span>, <span class="st">""</span>, urlpages_all) <span class="co">#get rid of page/1/ as not needed</span></span>
<span id="cb26-19"><a href="#cb26-19" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<div class="cell">

</div>
<p>What’s going on here? Well, in the first two lines, we are simply creating two empty character vectors that we’re going to populate in the subsequent loop. Remember that we have a set of eleven starting URLs, one for each of the months archived on this webpage.</p>
<p>So in the code beginning <code>for (i in seq_along(urls))</code> we are saying, similar to above, for the beginning url to the end url, do the following in a loop: first, read in the url with <code>url &lt;- urls[i]</code> then read the html it contains with <code>html &lt;- read_html(url)</code>.</p>
<p>After this line, we are getting the pages as a character vector of page numbers by calling the <code>html_elements()</code> function with the “.page” CSS selector and then converting the result to text with <code>html_text()</code>. This gives a series of pages stored as e.g.&nbsp;“1” “2” “3”.</p>
<p>In order to be able to see how many there are, we need to extract the highest number that appears in this string. To do this, we first need to reformat it as an “integer” object rather than a “character” object so that R can recognize that these are numbers. So we call <code>pageints &lt;- as.integer(pages)</code>. Then we get the maximum by simply calling: <code>npages &lt;- max(pageints, na.rm = T)</code>.</p>
<p>In the next part of the loop, we are taking the new information we have stored as “npages,” i.e., the number of pagination tabs for each month, and telling R: for each of these pages, define a new url by adding “page/” then the number of the pagination tab “j”, and then “/”. After we’ve bound all of these together, we get a list of URLs that look like this:</p>
<div class="cell">
<div class="sourceCode" id="cb27"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(urlpages_all)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>[1] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/"       
[2] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/page/2/"
[3] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/page/3/"
[4] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/page/4/"
[5] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/page/5/"
[6] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/page/6/"</code></pre>
</div>
</div>
</section>
<section id="looping-through-every-page-on-the-website" class="level2">
<h2 class="anchored" data-anchor-id="looping-through-every-page-on-the-website">Looping through every page on the website</h2>
<p>So what next?</p>
<p>The next step is to get the URLs for each of the documents contained in the archive for each month. How do we do this? Well, we can once again use the “Selector Gadget” tool to work this out. For the main landing pages of each month, we see listed, as below, each document in a list. For each of these documents, we see that the title, which links to the revolutionary leaflet in question, has two CSS selectors: “h2” and “.post”.</p>
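<p><img src="images/gifcap6.gif" class="img-fluid" style="width:100.0%" alt="Screen capture of the Selector Gadget tool highlighting the document titles on a month's landing page, which are identified by the h2 and .post CSS selectors"></p>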
<p>We can again pass these tags through <code>html_elements()</code> to grab what’s contained inside. We can then grab what’s contained inside these by extracting the “children” of these classes. In essence, this just means a lower level tag: tags can have tags within tags and these flow downwards like a family tree (hence the name, I suppose).</p>
<p>So one of the “children” of this HTML tag is the link contained inside, which we can get with calling <code>html_children()</code> followed by specifying that we want the specific attribute of the web link it encloses with <code>html_attr("href")</code>. The subsequent lines then just remove extraneous information.</p>
<p>The complete loop, then, to retrieve the URL of the page for every leaflet contained on this website is:</p>
<div class="cell">
<div class="sourceCode" id="cb29"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a><span class="co">#GET URLS FOR EACH PAMPHLET</span></span>
<span id="cb29-2"><a href="#cb29-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb29-3"><a href="#cb29-3" aria-hidden="true" tabindex="-1"></a>pamlinks_all <span class="ot">&lt;-</span> <span class="fu">character</span>(<span class="dv">0</span>)</span>
<span id="cb29-4"><a href="#cb29-4" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="fu">seq_along</span>(urlpages_all)) {</span>
<span id="cb29-5"><a href="#cb29-5" aria-hidden="true" tabindex="-1"></a>  url <span class="ot">&lt;-</span> urlpages_all[i]</span>
<span id="cb29-6"><a href="#cb29-6" aria-hidden="true" tabindex="-1"></a>  html <span class="ot">&lt;-</span> <span class="fu">read_html</span>(url)</span>
<span id="cb29-7"><a href="#cb29-7" aria-hidden="true" tabindex="-1"></a>  links <span class="ot">&lt;-</span> <span class="fu">html_elements</span>(html, <span class="st">".post , h2"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb29-8"><a href="#cb29-8" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_children</span>() <span class="sc">%&gt;%</span></span>
<span id="cb29-9"><a href="#cb29-9" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_attr</span>(<span class="st">"href"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb29-10"><a href="#cb29-10" aria-hidden="true" tabindex="-1"></a>    <span class="fu">na.omit</span>() <span class="sc">%&gt;%</span></span>
<span id="cb29-11"><a href="#cb29-11" aria-hidden="true" tabindex="-1"></a>    <span class="st">`</span><span class="at">attributes&lt;-</span><span class="st">`</span>(<span class="cn">NULL</span>)</span>
<span id="cb29-12"><a href="#cb29-12" aria-hidden="true" tabindex="-1"></a>  pamlinks_all <span class="ot">&lt;-</span> <span class="fu">c</span>(pamlinks_all, links)</span>
<span id="cb29-13"><a href="#cb29-13" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<div class="cell">

</div>
<p>Which gives us:</p>
<div class="cell">
<div class="sourceCode" id="cb30"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(pamlinks_all)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>[1] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-3-page-2/"                       
[2] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/the-most-important-workers-protests/"                    
[3] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/yes-its-the-workers-and-employees-right-to-strike/"      
[4] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/the-revolution-is-still-ongoing-2/"                      
[5] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/voice-of-the-revolution-the-revolution-is-still-ongoing/"
[6] "https://wayback.archive-it.org/2358/20120130143023/http://www.tahrirdocuments.org/2011/03/we-are-still-continuing-until-victory/"                  </code></pre>
</div>
<div class="sourceCode" id="cb32"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a><span class="fu">length</span>(pamlinks_all)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>[1] 523</code></pre>
</div>
</div>
<p>We see now that we have collected all 523 separate URLs for every revolutionary leaflet contained on these pages. Now we’re in a great position to be able to crawl each page and collect the information we need. This final loop is all we need to go through each URL we’re interested in and collect relevant information on document text, title, date, tags, and the URL to the image of the revolutionary literature itself.</p>
<p>See if you can work out yourselves how each part of this is fitting together. NOTE: if you want to run the final loop on your own machines it will take several hours to complete.</p>
<div class="cell">
<div class="sourceCode" id="cb34"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1" aria-hidden="true" tabindex="-1"></a>df_empty <span class="ot">&lt;-</span> <span class="fu">data.frame</span>()</span>
<span id="cb34-2"><a href="#cb34-2" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="fu">seq_along</span>(pamlinks_all)) {</span>
<span id="cb34-3"><a href="#cb34-3" aria-hidden="true" tabindex="-1"></a>  url <span class="ot">&lt;-</span> pamlinks_all[i]</span>
<span id="cb34-4"><a href="#cb34-4" aria-hidden="true" tabindex="-1"></a>  html <span class="ot">&lt;-</span> <span class="fu">read_html</span>(url)</span>
<span id="cb34-5"><a href="#cb34-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">cat</span>(<span class="st">"Collecting url number "</span>,i,<span class="st">": "</span>, url, <span class="st">"</span><span class="sc">\n</span><span class="st">"</span>)</span>
<span id="cb34-6"><a href="#cb34-6" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-7"><a href="#cb34-7" aria-hidden="true" tabindex="-1"></a>  error <span class="ot">&lt;-</span> <span class="fu">tryCatch</span>(html <span class="ot">&lt;-</span> <span class="fu">read_html</span>(url),</span>
<span id="cb34-8"><a href="#cb34-8" aria-hidden="true" tabindex="-1"></a>                    <span class="at">error=</span><span class="cf">function</span>(e) e)</span>
<span id="cb34-9"><a href="#cb34-9" aria-hidden="true" tabindex="-1"></a>  <span class="cf">if</span> (<span class="fu">inherits</span>(error, <span class="st">'error'</span>)) {</span>
<span id="cb34-10"><a href="#cb34-10" aria-hidden="true" tabindex="-1"></a>    df <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">title =</span> <span class="cn">NA</span>,</span>
<span id="cb34-11"><a href="#cb34-11" aria-hidden="true" tabindex="-1"></a>                     <span class="at">date =</span> <span class="cn">NA</span>,</span>
<span id="cb34-12"><a href="#cb34-12" aria-hidden="true" tabindex="-1"></a>                     <span class="at">text =</span> <span class="cn">NA</span>,</span>
<span id="cb34-13"><a href="#cb34-13" aria-hidden="true" tabindex="-1"></a>                     <span class="at">imageurl =</span> <span class="cn">NA</span>,</span>
<span id="cb34-14"><a href="#cb34-14" aria-hidden="true" tabindex="-1"></a>                     <span class="at">tags =</span> <span class="cn">NA</span>)</span>
<span id="cb34-15"><a href="#cb34-15" aria-hidden="true" tabindex="-1"></a>    <span class="cf">next</span></span>
<span id="cb34-16"><a href="#cb34-16" aria-hidden="true" tabindex="-1"></a>  }</span>
<span id="cb34-17"><a href="#cb34-17" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-18"><a href="#cb34-18" aria-hidden="true" tabindex="-1"></a>  df <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="fu">matrix</span>(<span class="at">ncol=</span><span class="dv">0</span>, <span class="at">nrow=</span><span class="fu">length</span>(<span class="dv">1</span>)))</span>
<span id="cb34-19"><a href="#cb34-19" aria-hidden="true" tabindex="-1"></a>  <span class="co">#get titles</span></span>
<span id="cb34-20"><a href="#cb34-20" aria-hidden="true" tabindex="-1"></a>  titles <span class="ot">&lt;-</span> <span class="fu">html_elements</span>(html, <span class="st">".title"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb34-21"><a href="#cb34-21" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_text</span>(<span class="at">trim=</span><span class="cn">TRUE</span>)</span>
<span id="cb34-22"><a href="#cb34-22" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-23"><a href="#cb34-23" aria-hidden="true" tabindex="-1"></a>  title <span class="ot">&lt;-</span> titles[<span class="dv">1</span>]</span>
<span id="cb34-24"><a href="#cb34-24" aria-hidden="true" tabindex="-1"></a>  df<span class="sc">$</span>title <span class="ot">&lt;-</span> title</span>
<span id="cb34-25"><a href="#cb34-25" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-26"><a href="#cb34-26" aria-hidden="true" tabindex="-1"></a>  <span class="co">#get date</span></span>
<span id="cb34-27"><a href="#cb34-27" aria-hidden="true" tabindex="-1"></a>  date <span class="ot">&lt;-</span> <span class="fu">html_elements</span>(html, <span class="st">".calendar"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb34-28"><a href="#cb34-28" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_text</span>(<span class="at">trim=</span><span class="cn">TRUE</span>)</span>
<span id="cb34-29"><a href="#cb34-29" aria-hidden="true" tabindex="-1"></a>  df<span class="sc">$</span>date <span class="ot">&lt;-</span> date</span>
<span id="cb34-30"><a href="#cb34-30" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-31"><a href="#cb34-31" aria-hidden="true" tabindex="-1"></a>  <span class="co">#get text</span></span>
<span id="cb34-32"><a href="#cb34-32" aria-hidden="true" tabindex="-1"></a>  textsep <span class="ot">&lt;-</span>  <span class="fu">html_elements</span>(html, <span class="st">"p"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb34-33"><a href="#cb34-33" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_text</span>(<span class="at">trim=</span><span class="cn">TRUE</span>)</span>
<span id="cb34-34"><a href="#cb34-34" aria-hidden="true" tabindex="-1"></a>  text <span class="ot">&lt;-</span> <span class="fu">paste</span>(textsep, <span class="at">collapse =</span> <span class="st">","</span>)</span>
<span id="cb34-35"><a href="#cb34-35" aria-hidden="true" tabindex="-1"></a>  df<span class="sc">$</span>text <span class="ot">&lt;-</span> text</span>
<span id="cb34-36"><a href="#cb34-36" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-37"><a href="#cb34-37" aria-hidden="true" tabindex="-1"></a>  <span class="co">#get tags</span></span>
<span id="cb34-38"><a href="#cb34-38" aria-hidden="true" tabindex="-1"></a>  pamtags <span class="ot">&lt;-</span> <span class="fu">html_elements</span>(html, <span class="st">".category"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb34-39"><a href="#cb34-39" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_text</span>(<span class="at">trim=</span><span class="cn">TRUE</span>)</span>
<span id="cb34-40"><a href="#cb34-40" aria-hidden="true" tabindex="-1"></a>  df<span class="sc">$</span>tags <span class="ot">&lt;-</span> pamtags</span>
<span id="cb34-41"><a href="#cb34-41" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-42"><a href="#cb34-42" aria-hidden="true" tabindex="-1"></a>  <span class="co">#get link to original pamphlet image</span></span>
<span id="cb34-43"><a href="#cb34-43" aria-hidden="true" tabindex="-1"></a>  elements_other <span class="ot">&lt;-</span>  <span class="fu">html_elements</span>(html, <span class="st">"a"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb34-44"><a href="#cb34-44" aria-hidden="true" tabindex="-1"></a>    <span class="fu">html_children</span>()</span>
<span id="cb34-45"><a href="#cb34-45" aria-hidden="true" tabindex="-1"></a>  url_element <span class="ot">&lt;-</span> <span class="fu">as.character</span>(elements_other[<span class="dv">2</span>])</span>
<span id="cb34-46"><a href="#cb34-46" aria-hidden="true" tabindex="-1"></a>  imgurl <span class="ot">&lt;-</span> <span class="fu">str_extract</span>(url_element, <span class="st">"src=</span><span class="sc">\\</span><span class="st">S+"</span>)</span>
<span id="cb34-47"><a href="#cb34-47" aria-hidden="true" tabindex="-1"></a>  imgurl <span class="ot">&lt;-</span> <span class="fu">substr</span>(imgurl, <span class="dv">6</span>, (<span class="fu">nchar</span>(imgurl)<span class="sc">-</span><span class="dv">1</span>))</span>
<span id="cb34-48"><a href="#cb34-48" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-49"><a href="#cb34-49" aria-hidden="true" tabindex="-1"></a>  df<span class="sc">$</span>imageurl <span class="ot">&lt;-</span> imgurl</span>
<span id="cb34-50"><a href="#cb34-50" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb34-51"><a href="#cb34-51" aria-hidden="true" tabindex="-1"></a>  df_empty <span class="ot">&lt;-</span> <span class="fu">rbind</span>(df_empty, df)</span>
<span id="cb34-52"><a href="#cb34-52" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>And now we’re pretty much there… back where we started!</p>
</section>
<section id="exercises" class="level2">
<h2 class="anchored" data-anchor-id="exercises">Exercises</h2>
<ul>
<li>Go to <a href="http://books.toscrape.com/" class="uri">http://books.toscrape.com/</a> and write a script that captures information on one of the genres listed on the left-hand side. Which information you capture is up to you (e.g., links, titles, prices…).</li>
<li>(Harder) Go to <a href="https://www.theguardian.com/politics" class="uri">https://www.theguardian.com/politics</a> and grab the title, author, and text of all of today’s politics articles in The Guardian.</li>
</ul>

</section>
</section>
</main>
<!-- /main column -->
<script type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  const icon = "";
  // Let AnchorJS decorate every `.anchored` element, using the icon defined
  // above and placing it on the right.
  const headingAnchors = new window.AnchorJS();
  headingAnchors.options = { placement: 'right', icon: icon };
  headingAnchors.add('.anchored');
  // Wire up the copy buttons: each button copies the element that
  // immediately precedes it in the DOM.
  const copyButtons = new window.ClipboardJS('.code-copy-button', {
    target: (trigger) => trigger.previousElementSibling
  });
  copyButtons.on('success', (copyEvent) => {
    const pressed = copyEvent.trigger;
    // Do not leave keyboard focus on the button after the click.
    pressed.blur();
    // Show the "checked" state for one second, then revert.
    pressed.classList.add('code-copy-button-checked');
    setTimeout(() => {
      pressed.classList.remove('code-copy-button-checked');
    }, 1000);
    // Clear the code selection.
    copyEvent.clearSelection();
  });
  // Attach a Tippy tooltip to `el`. `contentFn` is handed to Tippy unchanged
  // as the tooltip's `content` option.
  function tippyHover(el, contentFn) {
    window.tippy(el, {
      theme: 'quarto',
      placement: 'bottom-start',
      arrow: false,
      delay: 100,
      maxWidth: 500,
      allowHTML: true,
      content: contentFn,
      interactive: true,
      interactiveBorder: 10,
      // Mount the popup inside the parent of the element Tippy passes in.
      appendTo: (reference) => reference.parentElement
    });
  }
  // Footnote previews: give every footnote reference link a tooltip whose
  // content is the innerHTML of the element its href points at.
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  for (let i = 0; i < noterefs.length; i++) {
    const ref = noterefs[i];
    tippyHover(ref, function() {
      // A link without an href is treated as pointing at an empty fragment.
      let href = ref.getAttribute('href') || '';
      // Absolute URLs are reduced to their hash. `new URL` throws on a bare
      // fragment such as "#fn1", in which case href is used unchanged.
      try { href = new URL(href).hash; } catch {}
      const id = href.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      // The target element may not exist; return empty content instead of
      // throwing from inside the tooltip callback.
      return note ? note.innerHTML : '';
    });
  }
  // Citation previews: give every citation link a tooltip that stacks the
  // bibliography entries named in its parent's `data-cites` attribute.
  const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
  for (let i = 0; i < bibliorefs.length; i++) {
    const ref = bibliorefs[i];
    const holder = ref.parentElement;
    const citesAttr = holder ? holder.getAttribute('data-cites') : null;
    // Without a `data-cites` attribute there is nothing to show. Skip this
    // link rather than throwing and leaving the remaining links unprocessed.
    if (citesAttr === null) {
      continue;
    }
    const cites = citesAttr.split(' ');
    tippyHover(ref, function() {
      const popup = window.document.createElement('div');
      cites.forEach(function(cite) {
        const citeDiv = window.document.createElement('div');
        citeDiv.classList.add('hanging-indent');
        citeDiv.classList.add('csl-entry');
        // Entries are looked up by the id `ref-<cite key>`; a key with no
        // matching element yields an empty entry.
        const biblioDiv = window.document.getElementById('ref-' + cite);
        if (biblioDiv) {
          citeDiv.innerHTML = biblioDiv.innerHTML;
        }
        popup.appendChild(citeDiv);
      });
      return popup.innerHTML;
    });
  }
});
</script>
</div> <!-- /content -->


</body></html>