Explore WP_URL_In_Text_Processor to find URLs in a string

adamziel · Jun 23, 2024 · 5beed41 · 5beed41
1 parent cb11c3a
commit 5beed41
Show file tree

Hide file tree

Showing 9 changed files with 1,822 additions and 5 deletions.
diff --git a/transfer-protocol/.phpunit.result.cache b/transfer-protocol/.phpunit.result.cache
diff --git a/transfer-protocol/bin/regenerate_public_suffix_list.php b/transfer-protocol/bin/regenerate_public_suffix_list.php
@@ -0,0 +1,40 @@
+<?php
+/**
+ * Regenerates src/public_suffix_list.php from the list published at publicsuffix.org.
+ *
+ * Only single-label suffixes (entries without a dot, e.g. `com` or `pl`) are kept.
+ * Blank lines, `//` comment lines, and multi-label rules such as `co.uk` are skipped.
+ *
+ * The new list is first written to a temporary file next to the target and then
+ * renamed over it, so a failed download or write never clobbers the existing list.
+ *
+ * Usage: php bin/regenerate_public_suffix_list.php
+ */
+
+$source_url = 'https://publicsuffix.org/list/public_suffix_list.dat';
+$suffixes   = file_get_contents( $source_url );
+if ( false === $suffixes ) {
+	fwrite( STDERR, "Could not download the public suffix list from {$source_url}\n" );
+	exit( 1 );
+}
+
+$tlds = array();
+foreach ( explode( "\n", $suffixes ) as $line ) {
+	// Tolerate CRLF line endings and stray whitespace around an entry.
+	$line = trim( $line );
+
+	// Skip blank lines and comments (comment lines start with "//").
+	if ( '' === $line || '/' === $line[0] ) {
+		continue;
+	}
+
+	// Skip every rule made of more than one label; only bare TLDs are kept.
+	if ( false !== strpos( $line, '.' ) ) {
+		continue;
+	}
+
+	$tlds[] = $line;
+}
+
+// An empty result means the download was truncated or the format changed.
+// Refuse to replace a working list with an empty one.
+if ( 0 === count( $tlds ) ) {
+	fwrite( STDERR, "No top-level domains found in {$source_url}, leaving the existing list untouched.\n" );
+	exit( 1 );
+}
+
+$php_file_path     = __DIR__ . '/../src/public_suffix_list.php';
+$new_php_file_path = $php_file_path . '.swp';
+
+$php  = "<?php\n\n";
+$php .= "/**";
+$php .= "\n * Public suffix list for detecting URLs with known domains within text.";
+$php .= "\n * This file is automatically generated by regenerate_public_suffix_list.php.";
+$php .= "\n * Do not edit it directly.";
+$php .= "\n */\n\n";
+$php .= "return array(\n";
+foreach ( $tlds as $tld ) {
+	// var_export() emits a correctly escaped single-quoted PHP string literal.
+	$php .= "\t" . var_export( $tld, true ) . ",\n";
+}
+$php .= ");\n";
+
+if ( false === file_put_contents( $new_php_file_path, $php ) ) {
+	fwrite( STDERR, "Could not write {$new_php_file_path}\n" );
+	exit( 1 );
+}
+
+// rename() overwrites an existing target, so no unlink() is needed beforehand
+// and there is no moment at which the list file is missing.
+if ( ! rename( $new_php_file_path, $php_file_path ) ) {
+	unlink( $new_php_file_path );
+	fwrite( STDERR, "Could not move {$new_php_file_path} to {$php_file_path}\n" );
+	exit( 1 );
+}
diff --git a/transfer-protocol/bootstrap.php b/transfer-protocol/bootstrap.php
@@ -22,4 +22,5 @@
 
 require_once __DIR__ . '/src/WP_Block_Markup_Processor.php';
 require_once __DIR__ . '/src/WP_Block_Markup_Url_Processor.php';
+require_once __DIR__ . '/src/WP_URL_In_Text_Processor.php';
 require_once __DIR__ . '/vendor/autoload.php';
diff --git a/transfer-protocol/run-tests.sh b/transfer-protocol/run-tests.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
-COMMAND="phpunit tests/WP_Block_Markup_Url*"
+COMMAND="phpunit tests/WP_URL_In_Text_Processor*"
 $COMMAND
 fswatch -o ./**/*.php | xargs -n1 -I{} $COMMAND
diff --git a/transfer-protocol/src/WP_Block_Markup_Url_Processor.php b/transfer-protocol/src/WP_Block_Markup_Url_Processor.php
@@ -37,9 +37,9 @@ private function next_url_in_current_token() {
 			case '#block-comment':
 				return $this->next_url_block_attribute();
 				break;
-			// case '#text':
-			//     $this->rewrite_text();
-			//     break;
+			case '#text':
+				return $this->next_url_in_text_node();
+				break;
 		}
 	}
 
@@ -51,6 +51,7 @@ public function next_token() {
 		return parent::next_token();
 	}
 
+
 	/**
 	 * A list of HTML attributes meant to contain URLs, as defined in the HTML specification.
 	 * It includes some deprecated attributes like `lowsrc` and `highsrc` for the `IMG` element.
@@ -173,6 +174,11 @@ private function next_url_block_attribute() {
 
 		do {
 			$url_maybe = $this->block_attributes_iterator->current();
+			// @TODO: Investigate why LEAVES_ONLY isn't enough
+			if ( is_array( $url_maybe ) ) {
+				$this->block_attributes_iterator->next();
+				continue;
+			}
 			if ( URL::canParse( $url_maybe, $this->base_url ) ) {
 				$this->current_block_attribute_key   = $this->block_attributes_iterator->key();
 				$this->current_block_attribute_value = $url_maybe;

diff --git a/transfer-protocol/src/WP_URL_In_Text_Processor.php b/transfer-protocol/src/WP_URL_In_Text_Processor.php
@@ -0,0 +1,266 @@
+<?php
+
+use Rowbot\URL\URL;
+
+/**
+ * Finds URLs in text nodes.
+ *
+ * Looks for URLs:
+ * * Starting with http:// or https://
+ * * Starting with //
+ * * Domain-only, e.g. www.example.com
+ * * Domain + path, e.g. www.example.com/path
+ *
+ * ### Domain names
+ *
+ * UTF-8 characters in the domain names are supported even if they're
+ * not encoded as punycode. For example, scanning the text:
+ *
+ * > Więcej na łąka.pl
+ *
+ * Would yield `łąka.pl`
+ *
+ * ### Paths
+ *
+ * The path is limited to ASCII characters, as per the URL specification.
+ * For example, scanning the text:
+ *
+ * > Visit the WordPress plugins directory https://w.org/plugins?łąka=1
+ *
+ * Would yield `https://w.org/plugins?`, not `https://w.org/plugins?łąka=1`.
+ * However, scanning this text:
+ *
+ * > Visit the WordPress plugins directory https://w.org/plugins?%C5%82%C4%85ka=1
+ *
+ * Would yield `https://w.org/plugins?%C5%82%C4%85ka=1`.
+ *
+ * ### Parenthesis treatment
+ *
+ * This scanner captures parentheses as a part of the path, query, or fragment, except
+ * when they're seen as the last character in the URL. For example, scanning the text:
+ *
+ * > Visit the WordPress plugins directory (https://w.org/plugins)
+ *
+ * Would yield `https://w.org/plugins`, but scanning the text:
+ *
+ * > Visit the WordPress plugins directory (https://w.org/plug(in)s
+ *
+ * Would yield `https://w.org/plug(in)s`.
+ *
+ */
+class WP_URL_In_Text_Processor {
+
+	/**
+	 * The text being scanned for URLs.
+	 *
+	 * @var string
+	 */
+	private $text;
+
+	/**
+	 * Byte offset in $text where the next call to next_url() starts searching.
+	 *
+	 * @var int
+	 */
+	private $bytes_already_parsed;
+
+	/**
+	 * The URL found by the most recent successful next_url() call,
+	 * or null when there is no current URL.
+	 *
+	 * @var string|null
+	 */
+	private $url;
+
+	/**
+	 * Base URL passed to URL::canParse() when validating a candidate match.
+	 *
+	 * @var string
+	 */
+	private $base_url = 'https://w.org';
+
+	/**
+	 * The PCRE pattern built in the constructor and used by next_url().
+	 *
+	 * @var string
+	 */
+	private $regex;
+
+	/**
+	 * When true, the pattern is anchored with ^ and $ so that it must match
+	 * the entire text. Nothing in this class sets it to true.
+	 *
+	 * @var bool
+	 */
+	private $strict = false;
+
+	/**
+	 * The value returned by public_suffix_list.php, loaded by the first
+	 * constructor call and shared by all instances. It is loaded here but
+	 * not consulted by next_url().
+	 *
+	 * @var array|null
+	 */
+	private static $public_suffix_list;
+
+	/**
+	 * Characters that are forbidden in the host part of a URL.
+	 * See https://url.spec.whatwg.org/#host-miscellaneous.
+	 *
+	 * Neither constant is referenced by the regex-based scanner in this class.
+	 */
+	private const FORBIDDEN_HOST_BYTES = "\x00\x09\x0a\x0d\x20\x23\x2f\x3a\x3c\x3e\x3f\x40\x5b\x5c\x5d\x5e\x7c";
+	private const FORBIDDEN_DOMAIN_BYTES = "\x00\x09\x0a\x0d\x20\x23\x25\x2f\x3a\x3c\x3e\x3f\x40\x5b\x5c\x5d\x5e\x7c\x7f";
+	/**
+	 * Unlike RFC 3986, the WHATWG URL specification does not limit the domain
+	 * part of a URL to any length. That being said, we apply an arbitrary limit
+	 * here as an optimization to avoid scanning the entire text for a domain name.
+	 *
+	 * Rationale: Domains larger than 1KB are extremely rare.
+	 */
+	private const CONSIDER_DOMAINS_UP_TO_BYTES = 1024;
+
+	/**
+	 * Prepares a scanner positioned at the beginning of the given text.
+	 *
+	 * @param string $text The text to look for URLs in.
+	 */
+	public function __construct( $text ) {
+		if ( ! self::$public_suffix_list ) {
+			// Use `require`, not `require_once`: the latter returns `true` instead
+			// of the array when the file was already included somewhere else.
+			self::$public_suffix_list = require __DIR__ . '/public_suffix_list.php';
+		}
+		$this->bytes_already_parsed = 0;
+		$this->text                 = $text;
+		// A reverse string is useful for lookups. It does not form a valid
+		// text since strrev doesn't support UTF-8, but that's okay. We're
+		// only interested in the byte positions.
+		// $this->text_rev = strrev($text);
+
+		$prefix = $this->strict ? '^' : '';
+		$suffix = $this->strict ? '$' : '';
+
+		// Source: https://github.com/vstelmakh/url-highlight/blob/master/src/Matcher/Matcher.php
+		$this->regex = '/' . $prefix . '
+            (?|                                                        # scheme
+                (?<scheme>[a-z][\w\-]+):\/{2}                              # scheme ending with :\/\/
+                |                                                          # or
+                (?<scheme>mailto):                                         # mailto
+            )?
+            (?:                                                        # userinfo
+                (?:
+                    (?<=\/{2})                                             # prefixed with \/\/
+                    |                                                      # or
+                    (?=[^\p{Sm}\p{Sc}\p{Sk}\p{P}])                         # start with not: mathematical, currency, modifier symbol, punctuation
+                )
+                (?<userinfo>[^\s<>@\/]+)                                   # not: whitespace, < > @ \/
+                @                                                          # at
+            )?
+            (?=[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}])                   # followed by valid host char
+            (?|                                                        # host
+                (?<host>                                                   # host prefixed by scheme or userinfo (less strict)
+                    (?<=\/{2}|@)                                               # prefixed with \/\/ or @
+                    (?=[^\-])                                                  # label start, not: -
+                    (?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63}           # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
+                    (?<=[^\-])                                                 # label end, not: -
+                    (?:                                                        # more label parts
+                        \.
+                        (?=[^\-])                                                  # label start, not: -
+                        (?<tld>(?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63})   # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
+                        (?<=[^\-])                                                 # label end, not: -
+                    )*
+                )
+                |                                                          # or
+                (?<host>                                                   # host with tld (no scheme or userinfo)
+                    (?=[^\-])                                                  # label start, not: -
+                    (?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63}           # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
+                    (?<=[^\-])                                                 # label end, not: -
+                    (?:                                                        # more label parts
+                        \.
+                        (?=[^\-])                                                  # label start, not: -
+                        (?:[^\p{Z}\p{Sm}\p{Sc}\p{Sk}\p{C}\p{P}]|-){1,63}           # label not: whitespace, mathematical, currency, modifier symbol, control point, punctuation | except -
+                        (?<=[^\-])                                                 # label end, not: -
+                    )*                                                             
+                    \.(?<tld>\w{2,63})                                         # tld
+                )
+            )
+            (?:\:(?<port>\d+))?                                        # port
+            (?<path>                                                   # path, query, fragment
+                [\/?]                                                  # prefixed with \/ or ?
+                [^\s<>]*                                               # any chars except whitespace and <>
+                (?<=[^\s<>({\[`!;:\'".,?«»“”‘’])                       # end with not a space or some punctuation chars
+            )?
+        ' . $suffix . '/ixuJ';
+	}
+
+	/**
+	 * Advances to the next URL in the text.
+	 *
+	 * Candidates matched by the pattern that URL::canParse() rejects are
+	 * skipped. On success the URL is available through get_url().
+	 *
+	 * @return bool True when a URL was found, false when there are no more
+	 *              URLs or when preg_match() reports an error.
+	 */
+	public function next_url() {
+		$this->url = null;
+		while ( true ) {
+			$matches = array();
+			$found   = preg_match( $this->regex, $this->text, $matches, PREG_OFFSET_CAPTURE, $this->bytes_already_parsed );
+			if ( 1 !== $found ) {
+				return false;
+			}
+
+			// With PREG_OFFSET_CAPTURE every match is a [ text, byte offset ] pair.
+			list( $url, $starts_at ) = $matches[0];
+
+			// A single trailing ")" or "." is treated as surrounding punctuation,
+			// not as a part of the URL.
+			$last_byte = substr( $url, -1 );
+			if ( ')' === $last_byte || '.' === $last_byte ) {
+				$url = substr( $url, 0, -1 );
+			}
+
+			// Resume right after the (possibly trimmed) candidate, whether or not
+			// it turns out to be a valid URL, so the scan always moves forward.
+			$this->bytes_already_parsed = $starts_at + strlen( $url );
+
+			if ( ! URL::canParse( $url, $this->base_url ) ) {
+				continue;
+			}
+
+			$this->url = $url;
+			return true;
+		}
+	}
+
+	/**
+	 * Returns the URL found by the last successful next_url() call.
+	 *
+	 * @return string|false The current URL, or false when there is none.
+	 */
+	public function get_url() {
+		if ( null === $this->url ) {
+			return false;
+		}
+
+		return $this->url;
+	}
+
+	/**
+	 * Placeholder for replacing the current URL. The body is empty, so
+	 * calling it has no effect.
+	 *
+	 * @param string $new_url Currently ignored.
+	 */
+	public function set_url( $new_url ) {
+
+
+	}
+
+}
+
+
+//public function next_url_2() {
+//	$at = $this->bytes_already_parsed;
+//
+//	// Find the next dot in the text
+//	$dot_at = strpos($this->text, '.', $at);
+//
+//	// If there's no dot, assume there's no URL
+//	if(false === $dot_at) {
+//		return false;
+//	}
+//
+//	// The shortest tld is 2 characters long
+//	if($dot_at + 2 >= strlen($this->text)) {
+//		return false;
+//	}
+//
+//	$host_bytes_after_dot = strcspn(
+//		$this->text,
+//		self::FORBIDDEN_DOMAIN_BYTES,
+//		$dot_at + 1,
+//		self::CONSIDER_DOMAINS_UP_TO_BYTES
+//	);
+//
+//	if(0 === $host_bytes_after_dot) {
+//		return false;
+//	}
+//
+//	// Lookbehind to capture the rest of the domain name up to a forbidden character.
+//	$host_bytes_before_dot = strcspn(
+//		$this->text_rev,
+//		self::FORBIDDEN_DOMAIN_BYTES,
+//		strlen($this->text) - $dot_at - 1,
+//		self::CONSIDER_DOMAINS_UP_TO_BYTES
+//	);
+//
+//	$host_starts_at = $dot_at - $host_bytes_before_dot;
+//
+//	// Capture the protocol, if any
+//	$has_double_slash = false;
+//	if($host_starts_at > 2) {
+//		if ( '/' === $this->text[ $host_starts_at - 1 ] && '/' === $this->text[ $host_starts_at - 2 ] ) {
+//			$has_double_slash = true;
+//		}
+//	}
+//
+//	/**
+//	 * Look for http or https at the beginning of the URL.
+//	 * @TODO: Ensure the character before http or https is a word boundary.
+//	 */
+//	$has_protocol = false;
+//	if($has_double_slash && (
+//			(
+//				$host_starts_at >= 6 &&
+//				'h' === $this->text[$host_starts_at - 6] &&
+//				't' === $this->text[$host_starts_at - 5] &&
+//				't' === $this->text[$host_starts_at - 4] &&
+//				'p' === $this->text[$host_starts_at - 3]
+//			) ||
+//			(
+//				$host_starts_at >= 7 &&
+//				'h' === $this->text[$host_starts_at - 7] &&
+//				't' === $this->text[$host_starts_at - 6] &&
+//				't' === $this->text[$host_starts_at - 5] &&
+//				'p' === $this->text[$host_starts_at - 4] &&
+//				's' === $this->text[$host_starts_at - 3]
+//			)
+//		)) {
+//		$has_protocol = true;
+//	}
+//
+//	// Move the pointer to the end of the host
+//	$at = $dot_at + $host_bytes_after_dot;
+//
+//
+//
+//}