-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
CDPT-1325 WP CLI command to export an agency's documents with URLs wh…
…ere the document is referenced. (#801) * CDPT-1325 WP CLI command to export a CSV of an agency's documents with posts where the document is referenced. * Add mechanism to export broken links. * Update FindDocumentRefs.php
- Loading branch information
1 parent
dd4cb9d
commit 842121b
Showing
2 changed files
with
221 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
220 changes: 220 additions & 0 deletions
220
public/app/themes/clarity/inc/commands/FindDocumentRefs.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
<?php | ||
|
||
/** | ||
* FindDocumentRefs | ||
* | ||
* This command will search for document references in post content and metadata. | ||
* | ||
* Usage: | ||
* wp find_doc_refs <agency_slug> | ||
* | ||
* Example: | ||
* wp find_doc_refs hmcts | ||
*/ | ||
|
||
if (defined('WP_CLI') && WP_CLI) { | ||
class FindDocumentRefs | ||
{ | ||
public function __invoke($args, $assoc_args) | ||
{ | ||
$agency = $args[0]; | ||
|
||
if (!$agency) { | ||
WP_CLI::error('Please provide an agency slug.'); | ||
return; | ||
} | ||
|
||
global $wpdb; | ||
|
||
/** | ||
* Search the database for document references in post content and metadata. | ||
*/ | ||
|
||
$results = $wpdb->get_results(" | ||
SELECT ID, post_content AS content, 'post' AS type | ||
FROM wp_posts | ||
WHERE 1=1 | ||
AND post_status = 'publish' | ||
AND post_content LIKE '%/documents/____/__/%' | ||
UNION ALL | ||
SELECT post_id AS ID, meta_value AS content, 'metadata' AS type | ||
FROM wp_postmeta | ||
WHERE 1=1 | ||
AND meta_value LIKE '%/documents/____/__/%' | ||
AND post_id IN ( | ||
SELECT ID | ||
FROM wp_posts | ||
WHERE 1=1 | ||
AND post_status = 'publish' | ||
) | ||
"); | ||
|
||
// Init an array to store the documents and their references. | ||
$documents = []; | ||
|
||
// Broken document links | ||
$broken_links = []; | ||
|
||
foreach ($results as $result) { | ||
if ($result->type === 'post') { | ||
WP_CLI::line("Processing Post: {$result->ID}"); | ||
} else { | ||
WP_CLI::line("Processing Metadata for Post: {$result->ID}"); | ||
} | ||
|
||
// Split the post content at the /documents/____/__/ | ||
$parts = explode('/documents/', $result->content); | ||
|
||
// Remove the first part. | ||
array_shift($parts); | ||
|
||
// Map over the parts to clean them up. | ||
foreach ($parts as $part) { | ||
// Delete everything after the first quote. | ||
$part = explode(' ', $part)[0]; | ||
$part = explode('"', $part)[0]; | ||
$part = explode('\'', $part)[0]; | ||
$part = explode(')', $part)[0]; | ||
$part = explode("\n", $part)[0]; | ||
$part = explode("<", $part)[0]; | ||
$part = explode("?", $part)[0]; | ||
$part = explode("#", $part)[0]; | ||
$part = explode(">", $part)[0]; | ||
|
||
// Trim all white space | ||
$part = trim($part); // 2194 before this, 2187 after | ||
|
||
// Get the document ID | ||
$document_id = url_to_postid('/documents/' . rtrim($part, '/')); | ||
|
||
// Try and get the document by removing the date from the URL | ||
if (!$document_id) { | ||
$document_slug = end(explode('/', $part)); | ||
|
||
$document_id = url_to_postid('/documents/' . $document_slug); | ||
} | ||
|
||
if (!$document_id) { | ||
// Log the $part, so we can try and manually find it. | ||
WP_CLI::line("Could not find document ID for: {$part}"); | ||
|
||
// Get the agencies for this post | ||
$terms = get_the_terms($result->ID, 'agency'); | ||
|
||
// If there is only one agency and the slug is the same as the agency we're looking for, log the part. | ||
if (is_array($terms) && count($terms) === 1 && $terms[0]->slug === $agency) { | ||
WP_CLI::line("Document link not found: {$part}"); | ||
WP_CLI::line("On {$agency} page: " . get_permalink($result->ID)); | ||
|
||
$post_url = get_permalink($result->ID); | ||
|
||
// If we're running locally, replace the URL with the production URL. | ||
if (get_home_url() === 'http://intranet.docker') { | ||
$post_url = str_replace('http://intranet.docker', 'https://intranet.justice.gov.uk', get_permalink($result->ID)); | ||
} | ||
|
||
if (!isset($broken_links[$part])) { | ||
$broken_links[$part] = [ | ||
'locations' => [$post_url], | ||
'link' => 'https://intranet.justice.gov.uk/documents/' . $part, | ||
]; | ||
} else { | ||
$broken_links[$part]['locations'][] = $post_url; | ||
} | ||
} | ||
|
||
continue; | ||
} | ||
|
||
// Get the document's agency | ||
$terms = get_the_terms($document_id, 'agency'); | ||
|
||
if (empty($terms)) { | ||
continue; | ||
} | ||
|
||
// Filter the terms by the agency | ||
$filtered_terms = array_filter($terms, function ($term) use ($agency) { | ||
return $term->slug === $agency; | ||
}); | ||
|
||
if (empty($filtered_terms)) { | ||
continue; | ||
} | ||
|
||
$post_url = get_permalink($result->ID); | ||
|
||
// If we're running locally, replace the URL with the production URL. | ||
if (get_home_url() === 'http://intranet.docker') { | ||
$post_url = str_replace('http://intranet.docker', 'https://intranet.justice.gov.uk', get_permalink($result->ID)); | ||
} | ||
|
||
// If the document doesn't exist in the documents array, add it. | ||
if (!isset($documents[$document_id])) { | ||
$documents[$document_id] = [ | ||
'title' => get_the_title($document_id), | ||
'document_id' => $document_id, | ||
'links' => [$post_url] | ||
]; | ||
continue; | ||
} | ||
|
||
// If the document exists in the documents array, add the post URL to the links array. | ||
if ( | ||
!in_array($post_url, $documents[$document_id]['links'], true) | ||
) { | ||
$documents[$document_id]['links'][] = $post_url; | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Document references | ||
*/ | ||
|
||
// Sort the documents by document_id | ||
uasort($documents, static function ($a, $b) { | ||
return $b['document_id'] <=> $a['document_id']; | ||
}); | ||
|
||
// Flatten the documents array | ||
$documents_flat = array_map(static function ($document) { | ||
return [ | ||
$document['document_id'], | ||
$document['title'], | ||
...$document['links'] | ||
]; | ||
}, $documents); | ||
|
||
// Write the CSV | ||
$fd = fopen("/var/www/html/tmp/{$agency}_document_references.csv", 'w'); | ||
WP_CLI\Utils\write_csv($fd, $documents_flat); | ||
fclose($fd); | ||
|
||
/** | ||
* Broken links | ||
*/ | ||
|
||
// Sort the broken links by location | ||
uasort($broken_links, static function ($a, $b) { | ||
return $b['link'] <=> $a['link']; | ||
}); | ||
|
||
$broken_links_flat = array_map(static function ($link) { | ||
return [ | ||
$link['link'], | ||
...$link['locations'] | ||
]; | ||
}, $broken_links); | ||
|
||
// Write the CSV | ||
$fd = fopen("/var/www/html/tmp/{$agency}_broken_links.csv", 'w'); | ||
WP_CLI\Utils\write_csv($fd, $broken_links_flat); | ||
fclose($fd); | ||
} | ||
} | ||
|
||
WP_CLI::add_command('find_doc_refs', 'FindDocumentRefs'); | ||
} |