diff --git a/docs/_importers/wordpressdotcom.md b/docs/_importers/wordpressdotcom.md
index 296ad579..11a23a5e 100644
--- a/docs/_importers/wordpressdotcom.md
+++ b/docs/_importers/wordpressdotcom.md
@@ -17,10 +17,9 @@ Their default values are what you see above.
### Further WordPress migration alternatives
-While the above method works, it does not import much of the metadata that is
-usually stored in WordPress posts and pages. If you need to export things like
-pages, tags, custom fields, image attachments and so on, the following resources
-might be useful to you:
+While the above method works, it doesn't import every piece of metadata.
+If you need to import custom fields from your pages and posts, the
+following resources might be useful to you:
- [Exitwp](https://github.com/thomasf/exitwp) is a configurable tool written in
Python for migrating one or more WordPress blogs into Jekyll (Markdown) format
diff --git a/jekyll-import.gemspec b/jekyll-import.gemspec
index 76917a9f..3e23db17 100644
--- a/jekyll-import.gemspec
+++ b/jekyll-import.gemspec
@@ -51,7 +51,6 @@ Gem::Specification.new do |s|
# importer dependencies:
# s.add_development_dependency("behance", "~> 0.3") # uses outdated dependencies
- s.add_development_dependency("hpricot", "~> 0.8")
s.add_development_dependency("htmlentities", "~> 4.3")
s.add_development_dependency("mysql2", "~> 0.3")
s.add_development_dependency("open_uri_redirections", "~> 0.2")
diff --git a/lib/jekyll-import/importers/wordpressdotcom.rb b/lib/jekyll-import/importers/wordpressdotcom.rb
index 230525d7..8babf38a 100644
--- a/lib/jekyll-import/importers/wordpressdotcom.rb
+++ b/lib/jekyll-import/importers/wordpressdotcom.rb
@@ -8,7 +8,7 @@ def self.require_deps
rubygems
fileutils
safe_yaml
- hpricot
+ nokogiri
time
open-uri
open_uri_redirections
@@ -22,16 +22,16 @@ def self.specify_options(c)
end
# Will modify post DOM tree
- def self.download_images(title, post_hpricot, assets_folder)
- images = (post_hpricot / "img")
+ def self.download_images(title, post_doc, assets_folder)
+ images = post_doc.css("img")
return if images.empty?
- Jekyll.logger.info "Downloading images for ", title
+ Jekyll.logger.info "Downloading:", "images for #{title}"
images.each do |i|
uri = URI::DEFAULT_PARSER.escape(i["src"])
dst = File.join(assets_folder, File.basename(uri))
- i["src"] = File.join("{{ site.baseurl }}", dst)
+ i["src"] = File.join("{{site.baseurl}}", dst)
Jekyll.logger.info uri
if File.exist?(dst)
Jekyll.logger.info "Already in cache. Clean assets folder if you want a redownload."
@@ -54,15 +54,18 @@ def self.download_images(title, post_hpricot, assets_folder)
class Item
def initialize(node)
+ raise "Node is nil" if node.nil?
+
@node = node
end
def text_for(path)
- @node.at(path).inner_text
+ subnode = @node.at_xpath("./#{path}") || @node.at(path) || @node.children.find { |child| child.name == path }
+ subnode.text
end
def title
- @title ||= text_for(:title).strip
+ @title ||= text_for("title").strip
end
def permalink_title
@@ -76,12 +79,10 @@ def permalink_title
end
def permalink
- # Hpricot thinks "link" is a self closing tag so it puts the text of the link after the tag
- # but sometimes it works right! I think it's the xml declaration
@permalink ||= begin
uri = text_for("link")
- uri = @node.at("link").following[0] if uri.empty?
- URI(uri.to_s).path
+ uri = @node.at("link").next_sibling.text if uri.empty?
+ URI(uri.to_s.strip).path
end
end
@@ -127,12 +128,8 @@ def published?
def excerpt
@excerpt ||= begin
- text = Hpricot(text_for("excerpt:encoded")).inner_text
- if text.empty?
- nil
- else
- text
- end
+ text = Nokogiri::HTML(text_for("excerpt:encoded")).text
+ text.empty? ? nil : text
end
end
end
@@ -144,29 +141,32 @@ def self.process(options)
FileUtils.mkdir_p(assets_folder)
import_count = Hash.new(0)
- doc = Hpricot::XML(File.read(source))
+ doc = Nokogiri::XML(File.read(source))
# Fetch authors data from header
authors = Hash[
- (doc / :channel / "wp:author").map do |author|
- [author.at("wp:author_login").inner_text.strip, {
- "login" => author.at("wp:author_login").inner_text.strip,
- "email" => author.at("wp:author_email").inner_text,
- "display_name" => author.at("wp:author_display_name").inner_text,
- "first_name" => author.at("wp:author_first_name").inner_text,
- "last_name" => author.at("wp:author_last_name").inner_text,
- },]
+ doc.xpath("//channel/wp:author").map do |author|
+ [
+ author.xpath("./wp:author_login").text.strip,
+ {
+ "login" => author.xpath("./wp:author_login").text.strip,
+ "email" => author.xpath("./wp:author_email").text,
+ "display_name" => author.xpath("./wp:author_display_name").text,
+ "first_name" => author.xpath("./wp:author_first_name").text,
+ "last_name" => author.xpath("./wp:author_last_name").text,
+ },
+ ]
end
] rescue {}
- (doc / :channel / :item).each do |node|
+ doc.css("channel > item").each do |node|
item = Item.new(node)
- categories = node.search('category[@domain="category"]').map(&:inner_text).reject { |c| c == "Uncategorized" }.uniq
- tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
+ categories = node.css('category[domain="category"]').map(&:text).reject { |c| c == "Uncategorized" }.uniq
+ tags = node.css('category[domain="post_tag"]').map(&:text).uniq
metas = {}
- node.search("wp:postmeta").each do |meta|
- key = meta.at("wp:meta_key").inner_text
- value = meta.at("wp:meta_value").inner_text
+ node.xpath("./wp:postmeta").each do |meta|
+ key = meta.at_xpath("./wp:meta_key").text
+ value = meta.at_xpath("./wp:meta_value").text
metas[key] = value
end
@@ -189,7 +189,7 @@ def self.process(options)
}
begin
- content = Hpricot(item.text_for("content:encoded"))
+ content = Nokogiri::HTML(item.text_for("content:encoded"))
header["excerpt"] = item.excerpt if item.excerpt
if fetch
@@ -221,7 +221,7 @@ def self.process(options)
end
import_count.each do |key, value|
- Jekyll.logger.info "Imported #{value} #{key}s"
+ Jekyll.logger.info "Imported", "#{value} #{Util.pluralize(key, value)}"
end
end
diff --git a/lib/jekyll-import/util.rb b/lib/jekyll-import/util.rb
index 2a6e8b3e..c8865742 100644
--- a/lib/jekyll-import/util.rb
+++ b/lib/jekyll-import/util.rb
@@ -73,5 +73,13 @@ def self.wpautop(pee, br = true)
end
pee
end
+
+ # Returns word as-is when count is exactly 1 or word already ends in "s";
+ # otherwise appends "s". A count of 0 takes the plural form ("0 posts").
+ def self.pluralize(word, count)
+ return word if count == 1 || word.end_with?("s")
+
+ "#{word}s"
+ end
end
end
diff --git a/test/mocks/sitetitle.wordpress.2025-01-19.000.xml b/test/mocks/sitetitle.wordpress.2025-01-19.000.xml
new file mode 100644
index 00000000..97c9b6ce
--- /dev/null
+++ b/test/mocks/sitetitle.wordpress.2025-01-19.000.xml
@@ -0,0 +1,871 @@
+
+
+
+
+
+ My Site Title
+ https://jekyllbot.wordpress.com
+
+ Sun, 19 Jan 2025 02:41:16 +0000
+ en
+ 1.2
+ http://wordpress.com/
+ https://jekyllbot.wordpress.com
+
+ 12832738
+ jekyllbot
+ jekyllbot@gmail.com
+
+
+
+
+
+ 7202
+ foo
+
+
+
+
+ 1
+ uncategorized
+
+
+
+
+ 656
+ code
+
+
+
+ 4872
+ ruby
+
+
+
+ 69750
+ ship
+
+
+
+ 19214
+ stars
+
+
+ http://wordpress.com/
+
+ http://s0.wp.com/i/buttonw-com.png
+ My Site Title
+ https://jekyllbot.wordpress.com
+
+
+
+ https://jekyllbot.wordpress.com/about/
+ Sun, 19 Jan 2025 02:31:19 +0000
+ jekyllbot
+ http://jekyllbot.wordpress.com/?page_id=1
+
+
+
This is an example of a page. Unlike posts, which are displayed on your blog’s front page in the order they’re published, pages are better suited for more timeless content that you want to be easily accessible, like your About or Contact information. Click the Edit link to make changes to this page or add another page.
Welcome to WordPress! This is a sample post. Edit or delete it to take the first step in your blogging journey. To add more content here, click the small plus icon at the top left corner. There, you will find an existing selection of WordPress blocks and patterns, something to suit your every need for content creation. And don’t forget to check out the List View: click the icon a few spots to the right of the plus icon and you’ll get a tidy, easy-to-view list of the blocks and patterns in your post.
Welcome to WordPress! This is a sample post. Edit or delete it to take the first step in your blogging journey. To add more content here, click the small plus icon at the top left corner. There, you will find an existing selection of WordPress blocks and patterns, something to suit your every need for content creation. And don’t forget to check out the List View: click the icon a few spots to the right of the plus icon and you’ll get a tidy, easy-to-view list of the blocks and patterns in your post.
Welcome to WordPress! This is a sample post. Edit or delete it to take the first step in your blogging journey. To add more content here, click the small plus icon at the top left corner. There, you will find an existing selection of WordPress blocks and patterns, something to suit your every need for content creation. And don’t forget to check out the List View: click the icon a few spots to the right of the plus icon and you’ll get a tidy, easy-to-view list of the blocks and patterns in your post.
Welcome to WordPress! This is a sample post. Edit or delete it to take the first step in your blogging journey. To add more content here, click the small plus icon at the top left corner. There, you will find an existing selection of WordPress blocks and patterns, something to suit your every need for content creation. And don’t forget to check out the List View: click the icon a few spots to the right of the plus icon and you’ll get a tidy, easy-to-view list of the blocks and patterns in your post.
Welcome to WordPress! This is a sample post. Edit or delete it to take the first step in your blogging journey. To add more content here, click the small plus icon at the top left corner. There, you will find an existing selection of WordPress blocks and patterns, something to suit your every need for content creation. And don’t forget to check out the List View: click the icon a few spots to the right of the plus icon and you’ll get a tidy, easy-to-view list of the blocks and patterns in your post.
Welcome to WordPress! This is a sample post. Edit or delete it to take the first step in your blogging journey. To add more content here, click the small plus icon at the top left corner. There, you will find an existing selection of WordPress blocks and patterns, something to suit your every need for content creation. And don’t forget to check out the List View: click the icon a few spots to the right of the plus icon and you’ll get a tidy, easy-to-view list of the blocks and patterns in your post.
What kind of things would you share in your first blog post?
+
+
+
+
Perhaps an image:
+
+
+
+
+
+
+
+
Maybe some code:
+
+
+
+
query = "Generate an image of a night sky above the ocean with a wooden fishing ship on the water."
+puts "Executing query: #{query}"
+puts AIBot.new(query).execute.result
+
+
+
+
Or, perhaps you'd dream up something else entirely. What would it be?
query = "Generate an image of a night sky above the ocean with a wooden fishing ship on the water."
+puts "Executing query: \#{query}"
+puts AIBot.new(query).execute.result
+HTML
+
+ assert_path_exist File.expand_path("_posts/2025-01-18-the-art-of-connection.html", tmpdir)
+ post_content = File.read(File.expand_path("_posts/2025-01-18-the-art-of-connection.html", tmpdir))
+ post_front_matter = post_content.match(/^(---\n.*?---\n)/m)[0]
+ post_data = YAML.safe_load(post_front_matter)
+ assert_equal "In the ever-evolving world, the art of forging genuine connections remains timeless. Whether it’s with colleagues, clients, or partners, establishing a genuine rapport paves the way for collaborative success.", post_data["excerpt"]
+
+ # Assert all posts are imported.
+ [
+ "2025-01-18-adaptive-advantage.html",
+ "2025-01-18-collaboration-magic.html",
+ "2025-01-18-beyond-the-obstacle.html",
+ "2025-01-18-growth-unlocked.html",
+ "2025-01-18-teamwork-triumphs.html"
+ ].each do |post_slug|
+ assert_path_exist File.expand_path("_posts/"+post_slug, tmpdir)
+ end
+ end
end
class TestWordpressDotComItem < Test::Unit::TestCase
should "extract an item's title" do
- node = Hpricot('
+ node = Nokogiri::XML('
Dear Science
- ').at("item")
+ ').at_css("item")
item = Importers::WordpressDotCom::Item.new(node)
assert_equal("Dear Science", item.title)
end
should "use post_name for the permalink_title if it's there" do
- node = Hpricot('
+ node = Nokogiri::XML('
+ cookie-mountainDear Science
- ').at("item")
+
+ ').at_css("item")
item = Importers::WordpressDotCom::Item.new(node)
assert_equal("cookie-mountain", item.permalink_title)
end
should "sluggify title for the permalink_title if post_name is empty" do
- node = Hpricot('
+ node = Nokogiri::XML('
+ Dear Science
- ').at("item")
+
+ ').at_css("item")
item = Importers::WordpressDotCom::Item.new(node)
assert_equal("dear-science", item.permalink_title)
end
should "return nil for the excerpt, if it's missing" do
- node = Hpricot('
+ node = Nokogiri::XML('
+
- ').at("item")
+
+ ').at_css("item")
item = Importers::WordpressDotCom::Item.new(node)
assert_equal(nil, item.excerpt)
end
should "extract the excerpt as plaintext, if it's present" do
- node = Hpricot('
+ node = Nokogiri::XML('
+
- ').at("item")
+
+ ').at_css("item")
item = Importers::WordpressDotCom::Item.new(node)
assert_equal("...this one weird trick.", item.excerpt)
@@ -65,7 +166,8 @@ class TestWordpressDotComItem < Test::Unit::TestCase
class TestWordpressDotComPublishedItem < TestWordpressDotComItem
def node
- Hpricot('
+ Nokogiri::XML('
+ PostTitle
https://www.example.com/post/123/post-title/
@@ -73,7 +175,8 @@ def node
postpublish2015-01-23 08:53:47
- ').at("item")
+
+ ').at("item")
end
def item
@@ -107,13 +210,15 @@ def item
class TestWordpressDotComDraftItem < TestWordpressDotComItem
def node
- Hpricot('
+ Nokogiri::XML('
+ post-namepostdraft0000-00-00 00:00:00
- ').at("item")
+
+ ').at_css("item")
end
def item