Skip to content

Commit

Permalink
fix: add seencheck for m3u8 processing
Browse files: browse the repository at this point in the history
  • Loading branch information
CorentinB committed Sep 28, 2024
1 parent 8969b3c commit ca6cbf3
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers m
if extractor.IsM3U8(resp) {
assets, err := extractor.M3U8(resp)
if err == nil {
c.captureAssets(item, assets, cookies, headers)
assets = c.seencheckAssets(assets, item)
if len(assets) != 0 {
c.captureAssets(item, assets, cookies, headers)
}
} else {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
}
Expand Down Expand Up @@ -309,7 +312,8 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
if scriptType == "application/json" {
URLsFromJSON, err := extractor.GetURLsFromJSON([]byte(item.Text()))
if err != nil {
c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
// TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed
// c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
Expand Down Expand Up @@ -367,7 +371,8 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
if len(jsonContent[1]) > payloadEndPosition {
URLsFromJSON, err := extractor.GetURLsFromJSON([]byte(jsonContent[1][:payloadEndPosition+1]))
if err != nil {
c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
// TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed
// c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
Expand Down

0 comments on commit ca6cbf3

Please sign in to comment.