diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index a6d36cdb..2fc9b6d2 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -26,6 +26,15 @@ func (c *Crawl) extractAssets(base *url.URL, item *frontier.Item, doc *goquery.D } } + // Get assets from JSON payloads in data-item values + doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) { + dataItem, exists := item.Attr("data-item") + if exists { + URLsFromJSON, _ := getURLsFromJSON(dataItem) + rawAssets = append(rawAssets, URLsFromJSON...) + } + }) + // Check all elements style attributes for background-image & also data-preview doc.Find("*").Each(func(index int, item *goquery.Selection) { style, exists := item.Attr("style") diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index 8da76f17..6a311366 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -14,6 +14,7 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/clbanning/mxj/v2" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" + "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/telegram" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/tiktok" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/truthsocial" @@ -276,6 +277,18 @@ func (c *Crawl) Capture(item *frontier.Item) { } } } + } else if libsyn.IsLibsynURL(utils.URLToString(item.URL)) { + // Generate the highwinds URL + highwindsURL, err := libsyn.GenerateHighwindsURL(utils.URLToString(item.URL)) + if err != nil { + logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while generating libsyn URL") + } else { + if highwindsURL == nil { + logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while generating libsyn URL") + } else { + c.Capture(frontier.NewItem(highwindsURL, item, item.Type, item.Hop, 
const (
	// libsynHost is the hostname serving libsyn media downloads.
	libsynHost = "traffic.libsyn.com"

	// highwindsPrefix is the path prefix inserted right after the host to
	// force delivery through the highwinds CDN.
	highwindsPrefix = "/secure/force-cdn/highwinds"
)

// hasHighwindsPrefix reports whether path already starts with the
// highwinds CDN path prefix (either exactly, or followed by "/").
func hasHighwindsPrefix(path string) bool {
	return path == highwindsPrefix || strings.HasPrefix(path, highwindsPrefix+"/")
}

// IsLibsynURL reports whether URL is an absolute traffic.libsyn.com link to
// an .mp3 file that has not yet been rewritten to its highwinds form.
//
// The goal of this package is to turn
//
//	https://traffic.libsyn.com/democratieparticipative/DPS09E16.mp3
//
// into
//
//	https://traffic.libsyn.com/secure/force-cdn/highwinds/democratieparticipative/DPS09E16.mp3
//
// The host and the ".mp3" suffix are checked on the parsed URL components
// rather than on the raw string, so a foreign URL that merely mentions
// traffic.libsyn.com in its path or query does not match, and a query string
// after ".mp3" does not prevent a match.
//
// URLs that already carry the highwinds prefix return false. This matters
// because the rewritten URL is itself a traffic.libsyn.com .mp3 URL: without
// this guard, a caller that re-dispatches the rewritten URL through
// IsLibsynURL would rewrite it again and again.
//
// Unparseable URLs return false.
func IsLibsynURL(URL string) bool {
	u, err := url.Parse(URL)
	if err != nil {
		return false
	}

	if !strings.EqualFold(u.Hostname(), libsynHost) {
		return false
	}

	if hasHighwindsPrefix(u.Path) {
		return false
	}

	return strings.HasSuffix(u.Path, ".mp3")
}

// GenerateHighwindsURL returns URL with /secure/force-cdn/highwinds inserted
// immediately after the traffic.libsyn.com host.
//
// The rewrite is applied to the parsed path, so the scheme, port, query and
// fragment are preserved untouched. The function is idempotent: a URL that
// already has the highwinds prefix is returned as parsed, unchanged. A URL
// whose host is not traffic.libsyn.com is likewise returned unchanged.
//
// It returns a nil URL and a non-nil error only when URL cannot be parsed.
func GenerateHighwindsURL(URL string) (*url.URL, error) {
	u, err := url.Parse(URL)
	if err != nil {
		return nil, err
	}

	if !strings.EqualFold(u.Hostname(), libsynHost) || hasHighwindsPrefix(u.Path) {
		return u, nil
	}

	// RawPath is only set when the original path used a non-default
	// escaping; keep it in sync with Path so that escaping is preserved.
	if u.RawPath != "" {
		u.RawPath = highwindsPrefix + u.RawPath
	}
	u.Path = highwindsPrefix + u.Path

	return u, nil
}