Skip to content

Commit

Permalink
Merge pull request #5 from crispytx/b
Browse files Browse the repository at this point in the history
B
  • Loading branch information
crispytx committed May 30, 2016
2 parents fcdc368 + 7967c17 commit 8076a2c
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 93 deletions.
95 changes: 16 additions & 79 deletions tobycrawler31.php → tobycrawler32.php
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?php $MySQL = new mysqli('localhost', 'ADMIN', 'PASSWORD', 'DATABASE');
<?php $MySQL = new mysqli('localhost', 'USERNAME', 'PASSWORD', 'DATABASE');
$startTime = time();
$pages = 6; // Number of Pages to Crawl
function curlMultiRequest($urls, $options) {
$ch = array();
$results = array();
Expand All @@ -25,15 +26,14 @@ function curlMultiRequest($urls, $options) {
curl_multi_close($mh);
return $results;
}

$SQL = "SELECT * FROM locationTable;";
$output = $MySQL->query($SQL);
while ($record = $output->fetch_assoc()){
$lastIdScraped = $record['lastIdScraped'];
}
$scrapeNextId = $lastIdScraped;

for ($a = 0; $a < 20; $a++) {
for ($a = 0; $a < $pages; $a++) {
$scrapeNextId++;
$SQL = "SELECT domain FROM domainTable WHERE id = $scrapeNextId;";
$output = $MySQL->query($SQL);
Expand All @@ -56,7 +56,7 @@ function curlMultiRequest($urls, $options) {
*
*/
for ($z = 0; $z <= 3; $z++) {
for ($c = 0; $c < 20; $c++) {
for ($c = 0; $c < $pages; $c++) {
preg_match('/<title>(.*)<\/title>/i', $domainData[$z][$c], $title);
$titles[$z][$c] = $title[1];
}
Expand All @@ -66,66 +66,24 @@ function curlMultiRequest($urls, $options) {
$links[$z][$e][$f] = $hyperlink[0][$f];
}
}
/*
for ($g = 0; $g < count($titles[$z]); $g++) {
echo "<h2>LINKS FOR {$titles[$z][$g]} :</h2>";
for ($h = 0; $h < count($links[$z][$g]); $h++) {
echo " * {$links[$z][$g][$h]} <br>";
}
}
*/
}

$endTime = time();
$elapsedTime = $endTime - $startTime;
//echo "TOTAL TIME ELAPSED: $elapsedTime SECONDS <br>";
$fractionalTime = $elapsedTime / 20;
//echo "CRAWL TIME PER URL: $fractionalTime SECONDS <br>";

// CODE BELOW FOR TESTING PURPOSES ONLY
/*
echo "<h2>\$titles[0]:</h2>";
for ($x = 0; $x < count($titles[0]); $x++) {
echo "{$titles[0][$x]} <br>";
}
echo "TOTAL TIME ELAPSED: $elapsedTime SECONDS <br>";
$fractionalTime = $elapsedTime / $pages;
echo "CRAWL TIME PER URL: $fractionalTime SECONDS <br>";

echo "<h2>\$titles[1]:</h2>";
for ($x = 0; $x < count($titles[1]); $x++) {
echo "{$titles[1][$x]} <br>";
}
echo "<h2>\$titles[2]:</h2>";
for ($x = 0; $x < count($titles[2]); $x++) {
echo "{$titles[2][$x]} <br>";
}
echo "<h2>\$titles[3]:</h2>";
for ($x = 0; $x < count($titles[3]); $x++) {
echo "{$titles[3][$x]} <br>";
}
echo "<h1>LINKS:</h1>";
for ($i = 0; $i < count($links); $i++) {
for ($j = 0; $j < count($links[$i]); $j++) {
for ($k = 0; $k < count($links[$i][$j]); $k++) {
echo "{$links[$i][$j][$k]} <br>";
}
}
}
*/

for ($i = 0; $i < 20; $i++) {

for ($i = 0; $i < $pages; $i++) {
$prefixZero = count($links[0][$i]);
$prefixOne = count($links[1][$i]);
$prefixTwo = count($links[2][$i]);
$prefixThree = count($links[3][$i]);

$prefixSortArray[0] = $prefixZero;
$prefixSortArray[1] = $prefixOne;
$prefixSortArray[2] = $prefixTwo;
$prefixSortArray[3] = $prefixThree;

// BUBBLE SORT $prefixSortArray IN ORDER TO DETERMINE
// WHICH PREFIX+DOMAIN RETURNS THE MOST LINKS WHEN SCRAPED.
// THIS SHOULD SERVE AS A REASONABLE WAY TO DETERMINE WHICH
Expand Down Expand Up @@ -156,7 +114,6 @@ function curlMultiRequest($urls, $options) {
}
}
$mostLinks = $prefixSortArray[3];

switch ($mostLinks) {
case $prefixZero:
$bestLinks[$i] = $links[0][$i];
Expand All @@ -176,18 +133,9 @@ function curlMultiRequest($urls, $options) {
break;
}
}
// LOOP BELOW FOR TESTING PURPOSES ONLY
/*
for ($i = 0; $i < 20; $i++) {
echo "<h2>{$bestTitles[$i]} </h2>";
for ($j = 0; $j < count($bestLinks[$i]); $j++) {
echo "{$bestLinks[$i][$j]} <br>";
}
}
*/

// $bestLinks[0-19][variableAmount]
for ($i = 0; $i < 20; $i++) {
for ($i = 0; $i < $pages; $i++) {
for ($j = 1; $j < count($bestLinks[$i]); $j++) {
for ($k = 0; $k < $j; $k++) {
if ($bestLinks[$i][$j] == $bestLinks[$i][$k]) {
Expand All @@ -196,20 +144,9 @@ function curlMultiRequest($urls, $options) {
}
}
}

// CODE BELOW FOR TESTING PURPOSES ONLY
/*
for ($i = 0; $i < 20; $i++) {
echo "<h2>{$bestTitles[$i]} </h2>";
for ($j = 0; $j < count($bestLinks[$i]); $j++) {
if ($bestLinks[$i][$j] == "DUPLICATE") { continue; };
echo "{$bestLinks[$i][$j]} <br>";
}
}
*/

// MARK DOMAINS AS 'SCRAPED' ON DATABASE
for ($i = 0; $i < 20; $i++) {
for ($i = 0; $i < $pages; $i++) {
$SQL = "UPDATE domainTable SET scraped=\"true\" WHERE domain = \"{$domainNoPrefix[$i]}\";";
$MySQL->query($SQL);
}
Expand All @@ -221,7 +158,7 @@ function curlMultiRequest($urls, $options) {
$zId = $record['maxid'];
}
$zId++;
for ($i = 0; $i < 20; $i++) {
for ($i = 0; $i < $pages; $i++) {
for ($j = 0; $j < count($bestLinks[$i]); $j++) {
$SQL = "SELECT * FROM domainTable WHERE domain = \"{$bestLinks[$i][$j]}\";";
$output = $MySQL->query($SQL);
Expand All @@ -236,19 +173,19 @@ function curlMultiRequest($urls, $options) {


// MARK CRAWLED DOMAINS AS 'SCRAPED' ON THE domainTable
for ($i = 0; $i < 20; $i++) {
for ($i = 0; $i < $pages; $i++) {
$SQL = "UPDATE domainTable SET scraped=\"true\" WHERE domain = \"{$domainNoPrefix[$i]}\";";
$MySQL->query($SQL);
}

for ($i = 0; $i < 20; $i++) {
for ($i = 0; $i < $pages; $i++) {
$titleArray = explode(" ", $bestTitles[$i]);
$SQL = "INSERT INTO keywordTable VALUES (\"{$domainNoPrefix[$i]}\", \"{$bestTitles[$i]}\", \"{$titleArray[0]}\", \"{$titleArray[1]}\", \"{$titleArray[2]}\", \"{$titleArray[3]}\", \"{$titleArray[4]}\", \"{$titleArray[5]}\", \"{$titleArray[6]}\", \"{$titleArray[7]}\", \"{$titleArray[8]}\", \"{$titleArray[9]}\", \"{$titleArray[10]}\", \"{$titleArray[11]}\", \"{$titleArray[12]}\");";
$MySQL->query($SQL);
}

$newLastIdScraped = $lastIdScraped + 20;
$newLastIdScraped = $lastIdScraped + $pages;
$SQL = "UPDATE locationTable SET lastIdScraped = \"$newLastIdScraped\";";
$MySQL->query($SQL);

$MySQL->close(); ?>
$MySQL->close(); ?>
11 changes: 0 additions & 11 deletions tobycron31.php

This file was deleted.

3 changes: 0 additions & 3 deletions tobycron31.txt

This file was deleted.

3 changes: 3 additions & 0 deletions tobycron32.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
$ crontab -e

* * * * * php /home/USERNAMEHERE/public_html/asktoby/tobycrawler32.php > /dev/null 2>&1

0 comments on commit 8076a2c

Please sign in to comment.