From c118ab1257d632b8cdc02cbf375a241feae8f31a Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:00:43 +0000 Subject: [PATCH 1/3] CDPT-1325 WP CLI command to export a CSV of an agency's documents with posts where the document is referenced. --- public/app/themes/clarity/functions.php | 1 + .../clarity/inc/commands/FindDocumentRefs.php | 147 ++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 public/app/themes/clarity/inc/commands/FindDocumentRefs.php diff --git a/public/app/themes/clarity/functions.php b/public/app/themes/clarity/functions.php index 969eecf0b..5f03e1e81 100644 --- a/public/app/themes/clarity/functions.php +++ b/public/app/themes/clarity/functions.php @@ -12,6 +12,7 @@ if (defined('WP_CLI') && WP_CLI) { require_once 'inc/commands/DocumentRevisionReconcile.php'; + require_once 'inc/commands/FindDocumentRefs.php'; require_once 'inc/commands/SyncUserRoles.php'; } diff --git a/public/app/themes/clarity/inc/commands/FindDocumentRefs.php b/public/app/themes/clarity/inc/commands/FindDocumentRefs.php new file mode 100644 index 000000000..360f8be1d --- /dev/null +++ b/public/app/themes/clarity/inc/commands/FindDocumentRefs.php @@ -0,0 +1,147 @@ + + * + * Example: + * wp find_doc_refs hmcts + */ + +if (defined('WP_CLI') && WP_CLI) { + class FindDocumentRefs + { + public function __invoke($args, $assoc_args) + { + $agency = $args[0]; + + if (!$agency) { + WP_CLI::error('Please provide an agency slug.'); + return; + } + + global $wpdb; + + /** + * Search the database for document references in post content and metadata. + */ + + $results = $wpdb->get_results(" + SELECT ID, post_content AS content, 'post' AS type + FROM wp_posts + WHERE 1=1 + AND post_status = 'publish' + AND post_content LIKE '%/documents/____/__/%' + + UNION ALL + + SELECT post_id AS ID, meta_value AS content, 'metadata' AS type + FROM wp_postmeta + WHERE 1=1 + AND meta_value LIKE '%/documents/____/__/%' + AND post_id IN ( + SELECT ID + FROM wp_posts + WHERE 1=1 + AND post_status = 'publish' + ) + "); + + // Init an array to store the documents and their references. + $documents = []; + + foreach ($results as $result) { + if ($result->type === 'post') { + WP_CLI::line("Processing Post: {$result->ID}"); + } else { + WP_CLI::line("Processing Metadata for Post: {$result->ID}"); + } + + // Split the post content at the /documents/____/__/ + $parts = explode('/documents/', $result->content); + + // Remove the first part. + array_shift($parts); + + // Map over the parts to clean them up. + foreach ($parts as $part) { + // Delete everything after the first quote. + $part = explode(' ', $part)[0]; + $part = explode('"', $part)[0]; + $part = explode('\'', $part)[0]; + $part = explode(')', $part)[0]; + + // Get the document ID + $document_id = url_to_postid('/documents/' . rtrim($part, '/')); + + if (!$document_id) { + continue; + } + + // Get the document's agency + $terms = get_the_terms($document_id, 'agency'); + + if (empty($terms)) { + continue; + } + + // Filter the terms by the agency + $filtered_terms = array_filter($terms, function ($term) use ($agency) { + return $term->slug === $agency; + }); + + if (empty($filtered_terms)) { + continue; + } + + $post_url = get_permalink($result->ID); + + // If we're running locally, replace the URL with the production URL. + if (get_home_url() === 'http://intranet.docker') { + $post_url = str_replace('http://intranet.docker', 'https://intranet.justice.gov.uk', get_permalink($result->ID)); + } + + // If the document doesn't exist in the documents array, add it. + if (!isset($documents[$document_id])) { + $documents[$document_id] = [ + 'document_id' => $document_id, + 'links' => [$post_url] + ]; + continue; + } + + // If the document exists in the documents array, add the post URL to the links array. + if ( + !in_array($post_url, $documents[$document_id]['links'], true) + ) { + $documents[$document_id]['links'][] = $post_url; + } + } + } + + // Sort the documents by document_id + uasort($documents, static function ($a, $b) { + return $b['document_id'] <=> $a['document_id']; + }); + + // Flatten the documents array + $documents_flat = array_map(static function ($document) { + return [ + $document['document_id'], + ...$document['links'] + ]; + }, $documents); + + // Write the CSV + $fd = fopen("/var/www/html/tmp/{$agency}_document_references.csv", 'w'); + WP_CLI\Utils\write_csv($fd, $documents_flat); + fclose($fd); + } + } + + WP_CLI::add_command('find_doc_refs', 'FindDocumentRefs'); +} From e88e1568fd077c3f0b9d7d14a12db3aaa550afde Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:50:38 +0000 Subject: [PATCH 2/3] Add mechanism to export broken links. --- .../clarity/inc/commands/FindDocumentRefs.php | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/public/app/themes/clarity/inc/commands/FindDocumentRefs.php b/public/app/themes/clarity/inc/commands/FindDocumentRefs.php index 360f8be1d..dd8091a29 100644 --- a/public/app/themes/clarity/inc/commands/FindDocumentRefs.php +++ b/public/app/themes/clarity/inc/commands/FindDocumentRefs.php @@ -54,6 +54,9 @@ public function __invoke($args, $assoc_args) // Init an array to store the documents and their references. $documents = []; + // Broken document links + $broken_links = []; + foreach ($results as $result) { if ($result->type === 'post') { WP_CLI::line("Processing Post: {$result->ID}"); @@ -74,11 +77,41 @@ public function __invoke($args, $assoc_args) $part = explode('"', $part)[0]; $part = explode('\'', $part)[0]; $part = explode(')', $part)[0]; + $part = explode("\n", $part)[0]; // 2211 before this, 2194 after + + // Trim all white space + $part = trim($part); // 2194 before this, 2187 after // Get the document ID $document_id = url_to_postid('/documents/' . rtrim($part, '/')); if (!$document_id) { + // Log the $part, so we can try and manually find it. + WP_CLI::line("Could not find document ID for: {$part}"); + + // Get the agencies for this post + $terms = get_the_terms($result->ID, 'agency'); + + // If there is only one agency and the slug is the same as the agency we're looking for, log the part. + if (is_array($terms) && count($terms) === 1 && $terms[0]->slug === $agency) { + WP_CLI::line("Document link not found: {$part}"); + WP_CLI::line("On {$agency} page: " . get_permalink($result->ID)); + + $post_url = get_permalink($result->ID); + + // If we're running locally, replace the URL with the production URL. + if (get_home_url() === 'http://intranet.docker') { + $post_url = str_replace('http://intranet.docker', 'https://intranet.justice.gov.uk', get_permalink($result->ID)); + } + + $broken_links[] = [ + 'location_id' => $result->ID, + 'location' => $post_url, + 'link' => $part, + 'post_type' => get_post_type($result->ID) + ]; + } + continue; } @@ -123,6 +156,10 @@ public function __invoke($args, $assoc_args) } } + /** + * Document references + */ + // Sort the documents by document_id uasort($documents, static function ($a, $b) { return $b['document_id'] <=> $a['document_id']; @@ -140,6 +177,20 @@ public function __invoke($args, $assoc_args) $fd = fopen("/var/www/html/tmp/{$agency}_document_references.csv", 'w'); WP_CLI\Utils\write_csv($fd, $documents_flat); fclose($fd); + + /** + * Broken links + */ + + // Sort the broken links by location + uasort($broken_links, static function ($a, $b) { + return $b['location_id'] <=> $a['location_id']; + }); + + // Write the CSV + $fd = fopen("/var/www/html/tmp/{$agency}_broken_links.csv", 'w'); + WP_CLI\Utils\write_csv($fd, $broken_links); + fclose($fd); } } From 94e4807d032591ed5b21afc693369a5c46ccfcbe Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Fri, 20 Dec 2024 14:28:43 +0000 Subject: [PATCH 3/3] Update FindDocumentRefs.php --- .../clarity/inc/commands/FindDocumentRefs.php | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/public/app/themes/clarity/inc/commands/FindDocumentRefs.php b/public/app/themes/clarity/inc/commands/FindDocumentRefs.php index dd8091a29..111dfda6e 100644 --- a/public/app/themes/clarity/inc/commands/FindDocumentRefs.php +++ b/public/app/themes/clarity/inc/commands/FindDocumentRefs.php @@ -77,7 +77,11 @@ public function __invoke($args, $assoc_args) $part = explode('"', $part)[0]; $part = explode('\'', $part)[0]; $part = explode(')', $part)[0]; - $part = explode("\n", $part)[0]; // 2211 before this, 2194 after + $part = explode("\n", $part)[0]; + $part = explode("<", $part)[0]; + $part = explode("?", $part)[0]; + $part = explode("#", $part)[0]; + $part = explode(">", $part)[0]; // Trim all white space $part = trim($part); // 2194 before this, 2187 after @@ -85,6 +89,13 @@ public function __invoke($args, $assoc_args) // Get the document ID $document_id = url_to_postid('/documents/' . rtrim($part, '/')); + // Try and get the document by removing the date from the URL + if (!$document_id) { + $document_slug = end(explode('/', $part)); + + $document_id = url_to_postid('/documents/' . $document_slug); + } + if (!$document_id) { // Log the $part, so we can try and manually find it. WP_CLI::line("Could not find document ID for: {$part}"); @@ -104,12 +115,14 @@ public function __invoke($args, $assoc_args) $post_url = str_replace('http://intranet.docker', 'https://intranet.justice.gov.uk', get_permalink($result->ID)); } - $broken_links[] = [ - 'location_id' => $result->ID, - 'location' => $post_url, - 'link' => $part, - 'post_type' => get_post_type($result->ID) - ]; + if (!isset($broken_links[$part])) { + $broken_links[$part] = [ + 'locations' => [$post_url], + 'link' => 'https://intranet.justice.gov.uk/documents/' . $part, + ]; + } else { + $broken_links[$part]['locations'][] = $post_url; + } } continue; @@ -141,6 +154,7 @@ public function __invoke($args, $assoc_args) // If the document doesn't exist in the documents array, add it. if (!isset($documents[$document_id])) { $documents[$document_id] = [ + 'title' => get_the_title($document_id), 'document_id' => $document_id, 'links' => [$post_url] ]; @@ -169,6 +183,7 @@ public function __invoke($args, $assoc_args) $documents_flat = array_map(static function ($document) { return [ $document['document_id'], + $document['title'], ...$document['links'] ]; }, $documents); @@ -184,12 +199,19 @@ public function __invoke($args, $assoc_args) // Sort the broken links by location uasort($broken_links, static function ($a, $b) { - return $b['location_id'] <=> $a['location_id']; + return $b['link'] <=> $a['link']; }); + $broken_links_flat = array_map(static function ($link) { + return [ + $link['link'], + ...$link['locations'] + ]; + }, $broken_links); + // Write the CSV $fd = fopen("/var/www/html/tmp/{$agency}_broken_links.csv", 'w'); - WP_CLI\Utils\write_csv($fd, $broken_links); + WP_CLI\Utils\write_csv($fd, $broken_links_flat); fclose($fd); } }