Skip to content

Commit

Permalink
Add: libsyn custom code & data-item JSON processing
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Jun 4, 2024
1 parent dde4e10 commit ae4878c
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 0 deletions.
9 changes: 9 additions & 0 deletions internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ func (c *Crawl) extractAssets(base *url.URL, item *frontier.Item, doc *goquery.D
}
}

// Get assets from JSON payloads in data-item values
doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) {
dataItem, exists := item.Attr("data-item")
if exists {
URLsFromJSON, _ := getURLsFromJSON(dataItem)
rawAssets = append(rawAssets, URLsFromJSON...)
}
})

// Check all elements style attributes for background-image & also data-preview
doc.Find("*").Each(func(index int, item *goquery.Selection) {
style, exists := item.Attr("style")
Expand Down
13 changes: 13 additions & 0 deletions internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/clbanning/mxj/v2"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/telegram"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/tiktok"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/truthsocial"
Expand Down Expand Up @@ -276,6 +277,18 @@ func (c *Crawl) Capture(item *frontier.Item) {
}
}
}
} else if libsyn.IsLibsynURL(utils.URLToString(item.URL)) {
// Generate the highwinds URL
highwindsURL, err := libsyn.GenerateHighwindsURL(utils.URLToString(item.URL))
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while generating libsyn URL")
} else {
if highwindsURL == nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while generating libsyn URL")
} else {
c.Capture(frontier.NewItem(highwindsURL, item, item.Type, item.Hop, item.ID, false))
}
}
} else if tiktok.IsTikTokURL(utils.URLToString(item.URL)) {
tiktok.AddHeaders(req)
} else if telegram.IsTelegramURL(utils.URLToString(item.URL)) && !telegram.IsTelegramEmbedURL(utils.URLToString(item.URL)) {
Expand Down
22 changes: 22 additions & 0 deletions internal/pkg/crawl/sitespecific/libsyn/libsyn.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package libsyn

import (
"net/url"
"strings"
)

// IsLibsynURL reports whether URL is a libsyn MP3 link that still needs to be
// rewritten to its highwinds CDN form.
//
// The goal is to turn https://traffic.libsyn.com/democratieparticipative/DPS09E16.mp3
// into https://traffic.libsyn.com/secure/force-cdn/highwinds/democratieparticipative/DPS09E16.mp3
// so the rewrite basically adds /secure/force-cdn/highwinds/ after the domain.
//
// URLs that already contain the highwinds path are rejected. A rewritten URL
// still contains the libsyn host and still ends in ".mp3", so without this
// guard a caller that re-processes the rewritten URL would match it again and
// keep prepending the CDN path.
func IsLibsynURL(URL string) bool {
	return strings.Contains(URL, "traffic.libsyn.com") &&
		strings.HasSuffix(URL, ".mp3") &&
		!strings.Contains(URL, "/secure/force-cdn/highwinds/")
}

// GenerateHighwindsURL builds the highwinds CDN variant of a libsyn URL by
// inserting "/secure/force-cdn/highwinds" right after the first occurrence of
// "traffic.libsyn.com" in URL, then parsing the result.
//
// It returns the parsed URL, or a nil URL and the parse error when the
// rewritten string is not a valid URL. A URL that does not contain the libsyn
// host is parsed unchanged.
func GenerateHighwindsURL(URL string) (*url.URL, error) {
	const (
		libsynHost    = "traffic.libsyn.com"
		highwindsHost = "traffic.libsyn.com/secure/force-cdn/highwinds"
	)

	// Only the first occurrence is rewritten.
	rewritten := strings.Replace(URL, libsynHost, highwindsHost, 1)

	parsed, parseErr := url.Parse(rewritten)
	if parseErr != nil {
		return nil, parseErr
	}

	return parsed, nil
}

0 comments on commit ae4878c

Please sign in to comment.