diff --git a/tobycrawler31.php b/tobycrawler32.php
similarity index 74%
rename from tobycrawler31.php
rename to tobycrawler32.php
index ce5bf81..9a6c83b 100644
--- a/tobycrawler31.php
+++ b/tobycrawler32.php
@@ -1,5 +1,6 @@
-query($SQL);
while ($record = $output->fetch_assoc()){
$lastIdScraped = $record['lastIdScraped'];
}
$scrapeNextId = $lastIdScraped;
-
- for ($a = 0; $a < 20; $a++) {
+
+ for ($a = 0; $a < $pages; $a++) {
$scrapeNextId++;
$SQL = "SELECT domain FROM domainTable WHERE id = $scrapeNextId;";
$output = $MySQL->query($SQL);
@@ -56,7 +56,7 @@ function curlMultiRequest($urls, $options) {
*
*/
for ($z = 0; $z <= 3; $z++) {
- for ($c = 0; $c < 20; $c++) {
+ for ($c = 0; $c < $pages; $c++) {
preg_match('/
(.*)<\/title>/i', $domainData[$z][$c], $title);
$titles[$z][$c] = $title[1];
}
@@ -66,66 +66,24 @@ function curlMultiRequest($urls, $options) {
$links[$z][$e][$f] = $hyperlink[0][$f];
}
}
- /*
- for ($g = 0; $g < count($titles[$z]); $g++) {
- echo "LINKS FOR {$titles[$z][$g]} :
";
- for ($h = 0; $h < count($links[$z][$g]); $h++) {
- echo " * {$links[$z][$g][$h]}
";
- }
- }
- */
}
$endTime = time();
$elapsedTime = $endTime - $startTime;
- //echo "TOTAL TIME ELAPSED: $elapsedTime SECONDS
";
- $fractionalTime = $elapsedTime / 20;
- //echo "CRAWL TIME PER URL: $fractionalTime SECONDS
";
-
- // CODE BELOW FOR TESTING PURPOSES ONLY
- /*
- echo "\$titles[0]:
";
- for ($x = 0; $x < count($titles[0]); $x++) {
- echo "{$titles[0][$x]}
";
- }
+ echo "TOTAL TIME ELAPSED: $elapsedTime SECONDS
";
+ $fractionalTime = $elapsedTime / $pages;
+ echo "CRAWL TIME PER URL: $fractionalTime SECONDS
";
- echo "\$titles[1]:
";
- for ($x = 0; $x < count($titles[1]); $x++) {
- echo "{$titles[1][$x]}
";
- }
-
- echo "\$titles[2]:
";
- for ($x = 0; $x < count($titles[2]); $x++) {
- echo "{$titles[2][$x]}
";
- }
-
- echo "\$titles[3]:
";
- for ($x = 0; $x < count($titles[3]); $x++) {
- echo "{$titles[3][$x]}
";
- }
-
- echo "LINKS:
";
- for ($i = 0; $i < count($links); $i++) {
- for ($j = 0; $j < count($links[$i]); $j++) {
- for ($k = 0; $k < count($links[$i][$j]); $k++) {
- echo "{$links[$i][$j][$k]}
";
- }
- }
- }
- */
-
-for ($i = 0; $i < 20; $i++) {
+for ($i = 0; $i < $pages; $i++) {
$prefixZero = count($links[0][$i]);
$prefixOne = count($links[1][$i]);
$prefixTwo = count($links[2][$i]);
$prefixThree = count($links[3][$i]);
-
$prefixSortArray[0] = $prefixZero;
$prefixSortArray[1] = $prefixOne;
$prefixSortArray[2] = $prefixTwo;
$prefixSortArray[3] = $prefixThree;
-
// BUBBLE SORT $prefixSortArray IN ORDER TO DETERMINE
// WHICH PREFIX+DOMAIN RETURNS THE MOST LINKS WHEN SCRAPED.
// THIS SHOULD SERVE AS A RESONABLE WAY TO DETERMINE WHICH
@@ -156,7 +114,6 @@ function curlMultiRequest($urls, $options) {
}
}
$mostLinks = $prefixSortArray[3];
-
switch ($mostLinks) {
case $prefixZero:
$bestLinks[$i] = $links[0][$i];
@@ -176,18 +133,9 @@ function curlMultiRequest($urls, $options) {
break;
}
}
- // LOOP BELOW FOR TESTING PURPOSES ONLY
- /*
- for ($i = 0; $i < 20; $i++) {
- echo "{$bestTitles[$i]}
";
- for ($j = 0; $j < count($bestLinks[$i]); $j++) {
- echo "{$bestLinks[$i][$j]}
";
- }
- }
- */
// $bestLinks[0-19][variableAmount]
- for ($i = 0; $i < 20; $i++) {
+ for ($i = 0; $i < $pages; $i++) {
for ($j = 1; $j < count($bestLinks[$i]); $j++) {
for ($k = 0; $k < $j; $k++) {
if ($bestLinks[$i][$j] == $bestLinks[$i][$k]) {
@@ -196,20 +144,9 @@ function curlMultiRequest($urls, $options) {
}
}
}
-
- // CODE BELOW FOR TESTING PURPOSES ONLY
- /*
- for ($i = 0; $i < 20; $i++) {
- echo "{$bestTitles[$i]}
";
- for ($j = 0; $j < count($bestLinks[$i]); $j++) {
- if ($bestLinks[$i][$j] == "DUPLICATE") { continue; };
- echo "{$bestLinks[$i][$j]}
";
- }
- }
- */
// MARK DOMAINS AS 'SCRAPED' ON DATABASE
- for ($i = 0; $i < 20; $i++) {
+ for ($i = 0; $i < $pages; $i++) {
$SQL = "UPDATE domainTable SET scraped=\"true\" WHERE domain = \"{$domainNoPrefix[$i]}\";";
$MySQL->query($SQL);
}
@@ -221,7 +158,7 @@ function curlMultiRequest($urls, $options) {
$zId = $record['maxid'];
}
$zId++;
- for ($i = 0; $i < 20; $i++) {
+ for ($i = 0; $i < $pages; $i++) {
for ($j = 0; $j < count($bestLinks[$i]); $j++) {
$SQL = "SELECT * FROM domainTable WHERE domain = \"{$bestLinks[$i][$j]}\";";
$output = $MySQL->query($SQL);
@@ -236,19 +173,19 @@ function curlMultiRequest($urls, $options) {
// MARK CRAWLED DOMAINS AS 'SCRAPED' ON THE domainTable
- for ($i = 0; $i < 20; $i++) {
+ for ($i = 0; $i < $pages; $i++) {
$SQL = "UPDATE domainTable SET scraped=\"true\" WHERE domain = \"{$domainNoPrefix[$i]}\";";
$MySQL->query($SQL);
}
- for ($i = 0; $i < 20; $i++) {
+ for ($i = 0; $i < $pages; $i++) {
$titleArray = explode(" ", $bestTitles[$i]);
$SQL = "INSERT INTO keywordTable VALUES (\"{$domainNoPrefix[$i]}\", \"{$bestTitles[$i]}\", \"{$titleArray[0]}\", \"{$titleArray[1]}\", \"{$titleArray[2]}\", \"{$titleArray[3]}\", \"{$titleArray[4]}\", \"{$titleArray[5]}\", \"{$titleArray[6]}\", \"{$titleArray[7]}\", \"{$titleArray[8]}\", \"{$titleArray[9]}\", \"{$titleArray[10]}\", \"{$titleArray[11]}\", \"{$titleArray[12]}\");";
$MySQL->query($SQL);
}
- $newLastIdScraped = $lastIdScraped + 20;
+ $newLastIdScraped = $lastIdScraped + $pages;
$SQL = "UPDATE locationTable SET lastIdScraped = \"$newLastIdScraped\";";
$MySQL->query($SQL);
-$MySQL->close(); ?>
+$MySQL->close(); ?>
\ No newline at end of file
diff --git a/tobycron31.php b/tobycron31.php
deleted file mode 100644
index ffc62f0..0000000
--- a/tobycron31.php
+++ /dev/null
@@ -1,11 +0,0 @@
-
\ No newline at end of file
diff --git a/tobycron31.txt b/tobycron31.txt
deleted file mode 100644
index 809e5a2..0000000
--- a/tobycron31.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-$ crontab -e
-
-*/1 * * * * /usr/local/bin/php /home/USERNAMEHERE/public_html/asktoby/tobycron31.php > /dev/null 2>&1
diff --git a/tobycron32.txt b/tobycron32.txt
new file mode 100644
index 0000000..a27ee27
--- /dev/null
+++ b/tobycron32.txt
@@ -0,0 +1,3 @@
+$ crontab -e
+
+* * * * * php /home/USERNAMEHERE/public_html/asktoby/tobycrawler32.php > /dev/null 2>&1