From cfa298060090e8d6c78662c3e01479df6713e720 Mon Sep 17 00:00:00 2001
From: Corentin Barreau
Date: Thu, 12 Sep 2024 16:54:08 +0200
Subject: [PATCH] Add proper YouTube archiving via YT-DLP (#126)

* add: yt-dlp support to gather YouTube URLs from watch pages
* [site/yt] add: format selection & metadata record
* [ext/m3u8] initial commit
* fix: remove default global HTTP timeout
* [site/yt] wip: fix tests
* chores: small refactoring
* [site/yt] fix test
* ytdlp: remove useless subtitles parsing function
* m3u8: handle content-type case insensitively
* chore: small refactoring
* ytdlp: add dubbed audio streams
* ytdlp: format selection & refactoring
---
 cmd/get.go                                    |   6 +-
 config/config.go                              |   4 +
 go.mod                                        |   1 +
 go.sum                                        |   6 +
 internal/pkg/crawl/assets.go                  | 144 ++++++++++++++++--
 internal/pkg/crawl/capture.go                 | 139 +++++------------
 internal/pkg/crawl/config.go                  |  12 +-
 internal/pkg/crawl/crawl.go                   |  27 +++-
 .../pkg/crawl/dependencies/ytdlp/model.go     | 102 +++++++++++++
 .../pkg/crawl/dependencies/ytdlp/server.go    |  46 ++++++
 .../pkg/crawl/dependencies/ytdlp/ytdlp.go     |  80 ++++++++++
 internal/pkg/crawl/exclusion.go               |  44 ++++++
 internal/pkg/crawl/extractor/m3u8.go          |  49 ++++++
 internal/pkg/crawl/extractor/utils.go         |   9 ++
 internal/pkg/crawl/finish.go                  |   2 +-
 internal/pkg/crawl/outlinks.go                |  34 +----
 .../pkg/crawl/sitespecific/youtube/youtube.go |  40 +++++
 .../sitespecific/youtube/youtube_test.go      |  32 ++++
 .../sitespecific/youtube/youtube_test.html    |  88 +++++++++++
 internal/pkg/crawl/utils.go                   |  21 ---
 internal/pkg/crawl/worker.go                  |   2 +-
 21 files changed, 714 insertions(+), 174 deletions(-)
 create mode 100644 internal/pkg/crawl/dependencies/ytdlp/model.go
 create mode 100644 internal/pkg/crawl/dependencies/ytdlp/server.go
 create mode 100644 internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
 create mode 100644 internal/pkg/crawl/exclusion.go
 create mode 100644 internal/pkg/crawl/extractor/m3u8.go
 create mode 100644 internal/pkg/crawl/sitespecific/youtube/youtube.go
 create mode 100644 internal/pkg/crawl/sitespecific/youtube/youtube_test.go
 create mode 100644 internal/pkg/crawl/sitespecific/youtube/youtube_test.html

diff --git a/cmd/get.go b/cmd/get.go
index 8df1a470..03a92867 100644
--- a/cmd/get.go
+++ b/cmd/get.go
@@ -43,7 +43,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
 	getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.")
 	getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
 	getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
-	getCmd.PersistentFlags().Int("http-timeout", 30, "Number of seconds to wait before timing out a request.")
+	getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
 	getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
 	getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
 	getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
@@ -84,6 +84,10 @@ func getCMDsFlags(getCmd *cobra.Command) {
 	getCmd.PersistentFlags().String("es-password", "", "ElasticSearch password to use for indexing crawl logs.")
 	getCmd.PersistentFlags().String("es-index-prefix",
"zeno", "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`") + // Dependencies flags + getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.") + getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.") + // Alias support // As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually // This is a workaround to allow users to use `--hops` instead of `--max-hops` for example diff --git a/config/config.go b/config/config.go index 6ff70c99..fc30ffd1 100644 --- a/config/config.go +++ b/config/config.go @@ -76,6 +76,10 @@ type Config struct { NoStdoutLogging bool `mapstructure:"no-stdout-log"` NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` Handover bool `mapstructure:"handover"` + + // Dependencies + NoYTDLP bool `mapstructure:"no-ytdlp"` + YTDLPPath string `mapstructure:"ytdlp-path"` } var ( diff --git a/go.mod b/go.mod index dc8baa6d..1ab04290 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( github.com/google/uuid v1.6.0 github.com/gosuri/uilive v0.0.4 github.com/gosuri/uitable v0.0.4 + github.com/grafov/m3u8 v0.12.0 github.com/paulbellamy/ratecounter v0.2.0 github.com/philippgille/gokv/leveldb v0.7.0 github.com/prometheus/client_golang v1.20.3 diff --git a/go.sum b/go.sum index 10833203..d0a7accd 100644 --- a/go.sum +++ b/go.sum @@ -57,6 +57,12 @@ github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY= github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI= github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo= +github.com/grafana/pyroscope-go v1.1.2 h1:7vCfdORYQMCxIzI3NlYAs3FcBP760+gWuYWOyiVyYx8= +github.com/grafana/pyroscope-go v1.1.2/go.mod h1:HSSmHo2KRn6FasBA4vK7BMiQqyQq8KSuBKvrhkXxYPU= +github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg= +github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU= +github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4= +github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index fed41119..a605215e 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -1,20 +1,137 @@ package crawl import ( + "io" + "net/http" "net/url" "regexp" "strings" + "sync/atomic" "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/remeh/sizedwaitgroup" ) var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) +func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error { + var resp *http.Response + + // Prepare GET request + req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) + if err 
!= nil { + return err + } + + req.Header.Set("Referer", utils.URLToString(item.ParentURL)) + req.Header.Set("User-Agent", c.UserAgent) + + // If headers are passed, apply them to the request + if headers != nil { + for key, value := range headers { + req.Header.Set(key, value) + } + } + + // Apply cookies obtained from the original URL captured + for i := range cookies { + req.AddCookie(cookies[i]) + } + + resp, err = c.executeGET(item, req, false) + if err != nil && err.Error() == "URL from redirection has already been seen" { + return nil + } else if err != nil { + return err + } + defer resp.Body.Close() + + if extractor.IsM3U8(resp) { + assets, err := extractor.M3U8(resp) + if err == nil { + c.captureAssets(item, assets, cookies, headers) + } else { + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") + } + } + + io.Copy(io.Discard, resp.Body) + + return nil +} + +func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) { + // TODO: implement a counter for the number of assets + // currently being processed + // c.Frontier.QueueCount.Incr(int64(len(assets))) + swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) + excluded := false + + for _, asset := range assets { + // TODO: implement a counter for the number of assets + // currently being processed + // c.Frontier.QueueCount.Incr(-1) + + // Just making sure we do not over archive by archiving the original URL + if utils.URLToString(item.URL) == utils.URLToString(asset) { + continue + } + + // If the URL match any excluded string, we ignore it + for _, excludedString := range c.ExcludedStrings { + if strings.Contains(utils.URLToString(asset), excludedString) { + excluded = true + break + } + } + + if excluded { + excluded = false + continue + } + + swg.Add() + c.URIsPerSecond.Incr(1) + + go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { + defer swg.Done() + + // Create the asset's item + newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) + if err != nil { + c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ + "parentHop": item.Hop, + "parentUrl": utils.URLToString(item.URL), + "type": "asset", + })).Error("error while creating asset item") + return + } + + // Capture the asset + err = c.captureAsset(newAsset, cookies, headers) + if err != nil { + c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ + "parentHop": item.Hop, + "parentUrl": utils.URLToString(item.URL), + "type": "asset", + })).Error("error while capturing asset") + return + } + + // If we made it to this point, it means that the asset have been crawled successfully, + // then we can increment the locallyCrawled variable + atomic.AddUint64(&item.LocallyCrawled, 1) + }(asset, &swg) + } + + swg.Wait() +} + func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) { var rawAssets []string var URL = utils.URLToString(item.URL) @@ -198,7 +315,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu if err != nil { c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL) } else { - rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...) + rawAssets = append(rawAssets, URLsFromJSON...) 
} } } @@ -274,21 +391,26 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu // Turn strings into url.URL assets = append(assets, utils.StringSliceToURLSlice(rawAssets)...) - // Ensure that excluded hosts aren't in the assets. - assets = c.excludeHosts(assets) - - // Go over all assets and outlinks and make sure they are absolute links - assets = utils.MakeAbsolute(base, assets) + // Ensure that no asset that would be excluded is added to the list, + // remove all fragments, and make sure that all assets are absolute URLs + assets = c.cleanURLs(base, assets) return utils.DedupeURLs(assets), nil } -func removeGoogleVideoURLs(input []string) (output []string) { - for _, i := range input { - if !strings.Contains(i, "googlevideo.com") { - output = append(output, i) +func (c *Crawl) cleanURLs(base *url.URL, URLs []*url.URL) (output []*url.URL) { + // Remove excluded URLs + for _, URL := range URLs { + if !c.isExcluded(URL) { + output = append(output, URL) } } - return output + // Make all URLs absolute + if base != nil { + output = utils.MakeAbsolute(base, output) + } + + // Remove fragments + return utils.RemoveFragments(output) } diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index 04436776..31be9f1e 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -7,7 +7,6 @@ import ( "net/url" "strings" "sync" - "sync/atomic" "time" "github.com/PuerkitoBio/goquery" @@ -19,9 +18,9 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/tiktok" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/truthsocial" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/vk" + "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/youtube" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/remeh/sizedwaitgroup" ) func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bool) (resp *http.Response, err error) { @@ -189,37 +188,6 @@ func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bo return resp, nil } -func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { - var resp *http.Response - - // Prepare GET request - req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) - if err != nil { - return err - } - - req.Header.Set("Referer", utils.URLToString(item.ParentURL)) - req.Header.Set("User-Agent", c.UserAgent) - - // Apply cookies obtained from the original URL captured - for i := range cookies { - req.AddCookie(cookies[i]) - } - - resp, err = c.executeGET(item, req, false) - if err != nil && err.Error() == "URL from redirection has already been seen" { - return nil - } else if err != nil { - return err - } - defer resp.Body.Close() - - // needed for WARC writing - io.Copy(io.Discard, resp.Body) - - return nil -} - // Capture capture the URL and return the outlinks func (c *Crawl) Capture(item *queue.Item) error { var ( @@ -372,6 +340,34 @@ func (c *Crawl) Capture(item *queue.Item) error { } defer resp.Body.Close() + // If it was a YouTube watch page, we potentially want to run it through the YouTube extractor + // TODO: support other watch page URLs + if !c.NoYTDLP && youtube.IsYouTubeWatchPage(item.URL) { + URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body) + if err != nil { + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page") + return err + } + resp.Body.Close() + 
+ // Write the metadata record for the video + if rawJSON != "" { + c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON) + } + + var headers = make(map[string]string) + headers["Accept"] = HTTPHeaders.Accept + headers["Accept-Language"] = HTTPHeaders.AcceptLanguage + headers["Sec-Fetch-Mode"] = HTTPHeaders.SecFetchMode + headers["User-Agent"] = HTTPHeaders.UserAgent + + if len(URLs) > 0 { + c.captureAssets(item, URLs, resp.Cookies(), headers) + } + + return nil + } + // Scrape potential URLs from Link HTTP header var ( links = Parse(resp.Header.Get("link")) @@ -403,6 +399,11 @@ func (c *Crawl) Capture(item *queue.Item) error { if err != nil { c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON") } + } else if extractor.IsM3U8(resp) { + assets, err = extractor.M3U8(resp) + if err != nil { + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") + } } else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) { // If the response isn't a text/*, we do not scrape it. // We also aren't going to scrape if assets and outlinks are turned off. @@ -483,7 +484,7 @@ func (c *Crawl) Capture(item *queue.Item) error { } // Extract outlinks - outlinks, err := extractOutlinks(base, doc) + outlinks, err := c.extractOutlinks(base, doc) if err != nil { c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks") return err @@ -551,75 +552,7 @@ func (c *Crawl) Capture(item *queue.Item) error { } } - // TODO: implement a counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(int64(len(assets))) - swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) - excluded := false - - for _, asset := range assets { - // TODO: implement a counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(-1) - - // Just making sure we do not over archive by archiving the original URL - if utils.URLToString(item.URL) == utils.URLToString(asset) { - continue - } - - // We ban googlevideo.com URLs because they are heavily rate limited by default, and - // we don't want the crawler to spend an innapropriate amount of time archiving them - if strings.Contains(item.URL.Host, "googlevideo.com") { - continue - } - - // If the URL match any excluded string, we ignore it - for _, excludedString := range c.ExcludedStrings { - if strings.Contains(utils.URLToString(asset), excludedString) { - excluded = true - break - } - } - - if excluded { - excluded = false - continue - } - - swg.Add() - c.URIsPerSecond.Incr(1) - - go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { - defer swg.Done() - - // Create the asset's item - newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - })).Error("error while creating asset item") - return - } - - // Capture the asset - err = c.captureAsset(newAsset, resp.Cookies()) - if err != nil { - c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - })).Error("error while capturing asset") - return - } - - // If we made it to this point, it means that the asset have been 
crawled successfully, - // then we can increment the locallyCrawled variable - atomic.AddUint64(&item.LocallyCrawled, 1) - }(asset, &swg) - } + c.captureAssets(item, assets, resp.Cookies(), nil) - swg.Wait() return err } diff --git a/internal/pkg/crawl/config.go b/internal/pkg/crawl/config.go index 8e48da19..2d383460 100644 --- a/internal/pkg/crawl/config.go +++ b/internal/pkg/crawl/config.go @@ -97,7 +97,7 @@ type Crawl struct { CDXDedupeServer string WARCFullOnDisk bool WARCPoolSize int - WARCDedupSize int + WARCDedupeSize int DisableLocalDedupe bool CertValidation bool WARCCustomCookie string @@ -117,6 +117,10 @@ type Crawl struct { HQProducerChannel chan *queue.Item HQChannelsWg *sync.WaitGroup HQRateLimitingSendBack bool + + // Dependencies + NoYTDLP bool + YTDLPPath string } func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { @@ -235,7 +239,7 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.CertValidation = config.CertValidation c.WARCFullOnDisk = config.WARCOnDisk c.WARCPoolSize = config.WARCPoolSize - c.WARCDedupSize = config.WARCDedupeSize + c.WARCDedupeSize = config.WARCDedupeSize c.WARCCustomCookie = config.CDXCookie c.API = config.API @@ -250,6 +254,10 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.PrometheusMetrics.Prefix = config.PrometheusPrefix } + // Dependencies + c.NoYTDLP = config.NoYTDLP + c.YTDLPPath = config.YTDLPPath + if config.UserAgent != "Zeno" { c.UserAgent = config.UserAgent } else { diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go index d93dbd6f..a7549136 100644 --- a/internal/pkg/crawl/crawl.go +++ b/internal/pkg/crawl/crawl.go @@ -11,6 +11,7 @@ import ( "git.archive.org/wb/gocrawlhq" "github.com/CorentinB/warc" + "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/seencheck" "github.com/internetarchive/Zeno/internal/pkg/utils" @@ -92,9 +93,9 @@ func (c *Crawl) Start() (err error) { // Init WARC rotator settings rotatorSettings := c.initWARCRotatorSettings() - dedupeOptions := warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, SizeThreshold: c.WARCDedupSize} + dedupeOptions := warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, SizeThreshold: c.WARCDedupeSize} if c.CDXDedupeServer != "" { - dedupeOptions = warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, CDXDedupe: true, CDXURL: c.CDXDedupeServer, CDXCookie: c.WARCCustomCookie, SizeThreshold: c.WARCDedupSize} + dedupeOptions = warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, CDXDedupe: true, CDXURL: c.CDXDedupeServer, CDXCookie: c.WARCCustomCookie, SizeThreshold: c.WARCDedupeSize} } // Init the HTTP client responsible for recording HTTP(s) requests / responses @@ -120,8 +121,12 @@ func (c *Crawl) Start() (err error) { } }() - c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second - c.Log.Info("HTTP client timeout set", "timeout", c.HTTPTimeout) + if c.HTTPTimeout > 0 { + c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second + c.Log.Info("Global HTTP client timeout set", "timeout", c.HTTPTimeout) + } else { + c.Log.Info("Global HTTP client timeout not set (defaulting to infinite)") + } if c.Proxy != "" { proxyHTTPClientSettings := HTTPClientSettings @@ -149,6 +154,20 @@ func (c *Crawl) Start() (err error) { go c.startAPI() } + // Verify that dependencies exist on the system + if !c.NoYTDLP { + // If a yt-dlp path is specified, we use it, + // otherwise we try to find yt-dlp on 
the system + if c.YTDLPPath == "" { + path, found := ytdlp.FindPath() + if !found { + c.Log.Warn("yt-dlp not found on the system, please install it or specify the path in the configuration if you wish to use it") + } else { + c.YTDLPPath = path + } + } + } + // Parse input cookie file if specified if c.CookieFile != "" { cookieJar, err := cookiejar.NewFileJar(c.CookieFile, nil) diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go new file mode 100644 index 00000000..ea7d892d --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/model.go @@ -0,0 +1,102 @@ +package ytdlp + +type Subtitle struct { + Ext string `json:"ext"` + URL string `json:"url"` + Name string `json:"name"` +} + +type Video struct { + ID string `json:"id"` + IsLive bool `json:"is_live"` + Subtitles map[string][]Subtitle `json:"subtitles"` + RequestedFormats []struct { + Acodec string `json:"acodec"` + AspectRatio float64 `json:"aspect_ratio"` + Asr interface{} `json:"asr"` + AudioChannels interface{} `json:"audio_channels"` + AudioExt string `json:"audio_ext"` + Container string `json:"container"` + DynamicRange string `json:"dynamic_range"` + Ext string `json:"ext"` + Filesize float64 `json:"filesize"` + Format string `json:"format"` + FormatID string `json:"format_id"` + FormatNote string `json:"format_note"` + Fps float64 `json:"fps"` + Fragments []struct { + URL string `json:"url"` + } `json:"fragments"` + HasDrm bool `json:"has_drm"` + Height float64 `json:"height"` + HTTPHeaders HTTPHeaders `json:"http_headers"` + Language interface{} `json:"language"` + LanguagePreference float64 `json:"language_preference"` + Preference interface{} `json:"preference"` + Protocol string `json:"protocol"` + Quality float64 `json:"quality"` + Resolution string `json:"resolution"` + SourcePreference float64 `json:"source_preference"` + Tbr float64 `json:"tbr"` + URL string `json:"url"` + Vbr float64 `json:"vbr,omitempty"` + Vcodec string `json:"vcodec"` + VideoExt string `json:"video_ext"` + Width float64 `json:"width"` + Abr float64 `json:"abr,omitempty"` + } `json:"requested_formats"` + Formats []struct { + Acodec string `json:"acodec"` + AspectRatio float64 `json:"aspect_ratio"` + AudioExt string `json:"audio_ext"` + Columns float64 `json:"columns,omitempty"` + Ext string `json:"ext"` + Format string `json:"format"` + FormatID string `json:"format_id"` + FormatNote string `json:"format_note"` + Fps float64 `json:"fps"` + Fragments []struct { + Duration float64 `json:"duration"` + URL string `json:"url"` + } `json:"fragments,omitempty"` + Height float64 `json:"height"` + HTTPHeaders struct { + Accept string `json:"Accept"` + AcceptLanguage string `json:"Accept-Language"` + SecFetchMode string `json:"Sec-Fetch-Mode"` + UserAgent string `json:"User-Agent"` + } `json:"http_headers"` + Protocol string `json:"protocol"` + Resolution string `json:"resolution"` + Rows float64 `json:"rows,omitempty"` + URL string `json:"url"` + Vcodec string `json:"vcodec"` + VideoExt string `json:"video_ext"` + Width float64 `json:"width"` + Abr float64 `json:"abr,omitempty"` + Asr float64 `json:"asr,omitempty"` + AudioChannels float64 `json:"audio_channels,omitempty"` + Container string `json:"container,omitempty"` + DynamicRange interface{} `json:"dynamic_range,omitempty"` + Filesize float64 `json:"filesize,omitempty"` + HasDrm bool `json:"has_drm,omitempty"` + Language string `json:"language,omitempty"` + LanguagePreference float64 `json:"language_preference,omitempty"` + Preference interface{} 
`json:"preference,omitempty"` + Quality float64 `json:"quality,omitempty"` + SourcePreference float64 `json:"source_preference,omitempty"` + Tbr float64 `json:"tbr,omitempty"` + Vbr float64 `json:"vbr,omitempty"` + FilesizeApprox float64 `json:"filesize_approx,omitempty"` + } `json:"formats"` + Thumbnails []struct { + URL string `json:"url"` + } `json:"thumbnails"` +} + +type HTTPHeaders struct { + Accept string `json:"Accept"` + AcceptLanguage string `json:"Accept-Language"` + SecFetchMode string `json:"Sec-Fetch-Mode"` + UserAgent string `json:"User-Agent"` +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/server.go b/internal/pkg/crawl/dependencies/ytdlp/server.go new file mode 100644 index 00000000..334c7ee4 --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/server.go @@ -0,0 +1,46 @@ +package ytdlp + +import ( + "io" + "net" + "net/http" + "strings" +) + +func ServeBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) { + stopChan = make(chan struct{}) + portChan := make(chan int) + + bodyBytes, err := io.ReadAll(body) + if err != nil { + return 0, nil, err + } + + // Start the server + go func() { + // Serve the body on the random port + listener, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + panic(err) + } + defer listener.Close() + + portChan <- listener.Addr().(*net.TCPAddr).Port + + go func() { + <-stopChan + listener.Close() + }() + + // Create a handler that will serve the body on / + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Write(bodyBytes) + }) + + if err := http.Serve(listener, handler); err != nil && !strings.Contains(err.Error(), "use of closed network connection") { + return + } + }() + + return <-portChan, stopChan, nil +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go new file mode 100644 index 00000000..0a4f5fbe --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -0,0 +1,80 @@ +package ytdlp + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "strconv" +) + +func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, err error) { + // Prepare the command + cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port), "-f", "bv[protocol=https]+ba[protocol=https]") + + // Buffers to capture stdout and stderr + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + // Run the command + err = cmd.Run() + if err != nil { + return URLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) + } + + output := stdout.String() + + // Parse the output as a Video object + var video Video + err = json.Unmarshal([]byte(output), &video) + if err != nil { + return nil, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) + } + + // Get all subtitles (not automatic captions) + var subtitleURLs []string + for _, subtitle := range video.Subtitles { + for _, sub := range subtitle { + subtitleURLs = append(subtitleURLs, sub.URL) + } + } + + // Get all thumbnail URLs + for _, thumbnail := range video.Thumbnails { + URLs = append(URLs, thumbnail.URL) + } + + // Get the manifest URL for the best video & audio quality + // Note: we do not archive live streams + if !video.IsLive { + if len(video.RequestedFormats) > 0 { + HTTPHeaders = video.RequestedFormats[0].HTTPHeaders + for _, format := range video.RequestedFormats { + URLs = append(URLs, format.URL+"&video_id="+video.ID) + } + } + } + + // Get the 
storyboards + for _, format := range video.Formats { + if format.FormatNote == "storyboard" { + URLs = append(URLs, format.URL) + for _, fragment := range format.Fragments { + URLs = append(URLs, fragment.URL) + } + } + } + + URLs = append(URLs, subtitleURLs...) + + return URLs, output, HTTPHeaders, nil +} + +func FindPath() (string, bool) { + path, err := exec.LookPath("yt-dlp") + if err != nil { + return "", false + } + return path, true +} diff --git a/internal/pkg/crawl/exclusion.go b/internal/pkg/crawl/exclusion.go new file mode 100644 index 00000000..192a19f6 --- /dev/null +++ b/internal/pkg/crawl/exclusion.go @@ -0,0 +1,44 @@ +package crawl + +import ( + "net/url" + "strings" + + "github.com/internetarchive/Zeno/internal/pkg/utils" +) + +func (c *Crawl) isExcluded(URL *url.URL) bool { + // If Zeno is ran with --include-host flag, + // only URLs from the included hosts are crawled + if !c.isHostIncluded(URL) { + return false + } + + // Verify if the URL is excluded by the host + // (--exclude-host flag) + if c.isHostExcluded(URL) { + return true + } + + // Verify if the URL is excluded by the --exclude-string flag + for _, excludedString := range c.ExcludedStrings { + if strings.Contains(utils.URLToString(URL), excludedString) { + return true + } + } + + return false +} + +func (c *Crawl) isHostExcluded(URL *url.URL) bool { + return utils.StringInSlice(URL.Host, c.ExcludedHosts) +} + +func (c *Crawl) isHostIncluded(URL *url.URL) bool { + // If no hosts are included, all hosts are included + if len(c.IncludedHosts) == 0 { + return true + } + + return utils.StringInSlice(URL.Host, c.IncludedHosts) +} diff --git a/internal/pkg/crawl/extractor/m3u8.go b/internal/pkg/crawl/extractor/m3u8.go new file mode 100644 index 00000000..a3b3ee32 --- /dev/null +++ b/internal/pkg/crawl/extractor/m3u8.go @@ -0,0 +1,49 @@ +package extractor + +import ( + "net/http" + "net/url" + + "github.com/grafov/m3u8" +) + +func IsM3U8(resp *http.Response) bool { + return isContentType(resp.Header.Get("Content-Type"), "application/vnd.apple.mpegurl") || + isContentType(resp.Header.Get("Content-Type"), "application/x-mpegURL") +} + +func M3U8(resp *http.Response) (URLs []*url.URL, err error) { + p, listType, err := m3u8.DecodeFrom(resp.Body, true) + if err != nil { + return URLs, err + } + + var rawURLs []string + switch listType { + case m3u8.MEDIA: + mediapl := p.(*m3u8.MediaPlaylist) + + for _, segment := range mediapl.Segments { + if segment != nil { + rawURLs = append(rawURLs, segment.URI) + } + } + case m3u8.MASTER: + masterpl := p.(*m3u8.MasterPlaylist) + + for _, variant := range masterpl.Variants { + if variant != nil { + rawURLs = append(rawURLs, variant.URI) + } + } + } + + for _, rawURL := range rawURLs { + URL, err := url.Parse(rawURL) + if err == nil { + URLs = append(URLs, URL) + } + } + + return URLs, err +} diff --git a/internal/pkg/crawl/extractor/utils.go b/internal/pkg/crawl/extractor/utils.go index 3d8ee94d..bf01e1d8 100644 --- a/internal/pkg/crawl/extractor/utils.go +++ b/internal/pkg/crawl/extractor/utils.go @@ -3,8 +3,17 @@ package extractor import ( "net/url" "sort" + "strings" ) +func isContentType(header, targetContentType string) bool { + // Lowercase the header and target content type for case-insensitive comparison + header = strings.ToLower(header) + targetContentType = strings.ToLower(targetContentType) + + return strings.Contains(header, targetContentType) +} + // compareURLs compares two slices of *url.URL func compareURLs(a, b []*url.URL) bool { if len(a) != len(b) { diff --git 
a/internal/pkg/crawl/finish.go b/internal/pkg/crawl/finish.go index b34d920e..e5b49d7e 100644 --- a/internal/pkg/crawl/finish.go +++ b/internal/pkg/crawl/finish.go @@ -8,7 +8,7 @@ import ( ) // catchFinish is running in the background and detect when the crawl need to be terminated -// because it won't crawl anything more. This doesn't apply for Kafka-powered crawls. +// because it won't crawl anything more. This doesn't apply for HQ-powered crawls. func (crawl *Crawl) catchFinish() { for crawl.CrawledSeeds.Value()+crawl.CrawledAssets.Value() <= 0 { time.Sleep(1 * time.Second) diff --git a/internal/pkg/crawl/outlinks.go b/internal/pkg/crawl/outlinks.go index b838b803..e66c02fa 100644 --- a/internal/pkg/crawl/outlinks.go +++ b/internal/pkg/crawl/outlinks.go @@ -10,7 +10,7 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/utils" ) -func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, err error) { +func (c *Crawl) extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, err error) { var rawOutlinks []string // Extract outlinks @@ -43,11 +43,9 @@ func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, textOutlinks := extractLinksFromText(doc.Find("body").RemoveFiltered("script").Text()) outlinks = append(outlinks, textOutlinks...) - // Go over all outlinks and make sure they are absolute links - outlinks = utils.MakeAbsolute(base, outlinks) - - // Hash (or fragment) URLs are navigational links pointing to the exact same page as such, they should not be treated as new outlinks. - outlinks = utils.RemoveFragments(outlinks) + // Ensure that no outlink that would be excluded is added to the list, + // remove all fragments, and make sure that all assets are absolute URLs + outlinks = c.cleanURLs(base, outlinks) return utils.DedupeURLs(outlinks), nil } @@ -55,33 +53,9 @@ func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *queue.Item, wg *sync.WaitGroup) { defer wg.Done() - var excluded bool - // Send the outlinks to the pool of workers var items = make([]*queue.Item, 0, len(outlinks)) for _, outlink := range outlinks { - outlink := outlink - - // If the host of the outlink is in the host exclusion list, or the host is not in the host inclusion list - // if one is specified, we ignore the outlink - if utils.StringInSlice(outlink.Host, c.ExcludedHosts) || !c.checkIncludedHosts(outlink.Host) { - continue - } - - // If the outlink match any excluded string, we ignore it - for _, excludedString := range c.ExcludedStrings { - if strings.Contains(utils.URLToString(outlink), excludedString) { - excluded = true - break - } - } - - if excluded { - excluded = false - continue - } - - // Seencheck the outlink if c.UseSeencheck { if c.Seencheck.SeencheckURL(utils.URLToString(outlink), "seed") { continue diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/internal/pkg/crawl/sitespecific/youtube/youtube.go new file mode 100644 index 00000000..ab5059db --- /dev/null +++ b/internal/pkg/crawl/sitespecific/youtube/youtube.go @@ -0,0 +1,40 @@ +package youtube + +import ( + "io" + "net/url" + "strings" + + "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" +) + +func IsYouTubeWatchPage(URL *url.URL) bool { + return strings.Contains(URL.Host, "youtube.com") && (strings.Contains(URL.Path, "/watch") || strings.Contains(URL.Path, "/v/")) +} + +func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders 
ytdlp.HTTPHeaders, err error) { + // Create a temporary server to serve the body and call ytdlp on it + port, stopChan, err := ytdlp.ServeBody(body) + if err != nil { + return nil, rawJSON, HTTPHeaders, err + } + defer close(stopChan) + + // Call ytdlp on the temporary server + rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON(port) + if err != nil { + return nil, rawJSON, HTTPHeaders, err + } + + // Parse the URLs + for _, urlString := range rawURLs { + URL, err := url.Parse(urlString) + if err != nil { + return nil, rawJSON, HTTPHeaders, err + } + + URLs = append(URLs, URL) + } + + return URLs, rawJSON, HTTPHeaders, nil +} diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go new file mode 100644 index 00000000..5a86ade0 --- /dev/null +++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go @@ -0,0 +1,32 @@ +package youtube + +import ( + "os" + "testing" +) + +func TestParse(t *testing.T) { + // Make io.ReadCloser from the youtube_test.html file + f, err := os.Open("youtube_test.html") + if err != nil { + t.Fatal(err) + } + defer f.Close() + + // Parse the video + URLs, rawJSON, _, err := Parse(f) + if err != nil { + t.Fatal(err) + } + + // Check the raw JSON + if rawJSON == "" { + t.Fatal("Expected non-empty raw JSON") + } + + // Check the number of URLs + expected := 204 + if len(URLs) != expected { + t.Fatalf("Expected %d URLs, got %d", expected, len(URLs)) + } +} diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.html b/internal/pkg/crawl/sitespecific/youtube/youtube_test.html new file mode 100644 index 00000000..77474015 --- /dev/null +++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.html @@ -0,0 +1,88 @@ +$10,000 Every Day You Survive In A Grocery Store - YouTube
[remainder of the minified YouTube watch-page fixture: localized footer links and player markup, not reproduced here]
\ No newline at end of file
diff --git a/internal/pkg/crawl/utils.go b/internal/pkg/crawl/utils.go
index da9ccbe4..d59434cd 100644
--- a/internal/pkg/crawl/utils.go
+++ b/internal/pkg/crawl/utils.go
@@ -39,15 +39,6 @@ func (c *Crawl) crawlSpeedLimiter() {
 	}
 }
 
-func (c *Crawl) checkIncludedHosts(host string) bool {
-	// If no hosts are included, all hosts are included
-	if len(c.IncludedHosts) == 0 {
-		return true
-	}
-
-	return utils.StringInSlice(host, c.IncludedHosts)
-}
-
 func (c *Crawl) handleCrawlPause() {
 	for {
 		spaceLeft := float64(utils.GetFreeDiskSpace(c.JobPath).Avail) / float64(GB)
@@ -65,18 +56,6 @@ func (c *Crawl) handleCrawlPause() {
 	}
 }
 
-func (c *Crawl) excludeHosts(URLs []*url.URL) (output []*url.URL) {
-	for _, URL := range URLs {
-		if utils.StringInSlice(URL.Host, c.ExcludedHosts) || !c.checkIncludedHosts(URL.Host) {
-			continue
-		} else {
-			output = append(output, URL)
-		}
-	}
-
-	return output
-}
-
 func extractLinksFromText(source string) (links []*url.URL) {
 	// Extract links and dedupe them
 	rawLinks := utils.DedupeStrings(regexOutlinks.FindAllString(source, -1))
diff --git a/internal/pkg/crawl/worker.go b/internal/pkg/crawl/worker.go
index b1906f5c..2c790adf 100644
--- a/internal/pkg/crawl/worker.go
+++ b/internal/pkg/crawl/worker.go
@@ -114,7 +114,7 @@ func (w *Worker) Run() {
 	w.state.lastAction = "got item"
 
 	// If the host of the item is in the host exclusion list, we skip it
-	if utils.StringInSlice(item.URL.Host, w.pool.Crawl.ExcludedHosts) || !w.pool.Crawl.checkIncludedHosts(item.URL.Host) {
+	if utils.StringInSlice(item.URL.Host, w.pool.Crawl.ExcludedHosts) || !w.pool.Crawl.isHostIncluded(item.URL) {
 		if w.pool.Crawl.UseHQ {
 			w.state.lastAction = "skipping item because of host exclusion"
 			// If we are using the HQ, we want to mark the item as done
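
Note for reviewers: the new dependency path is small enough to exercise outside of a full crawl. The sketch below uses only helpers introduced by this patch (ytdlp.FindPath, ytdlp.ServeBody, ytdlp.GetJSON); the standalone main and the watch_page.html input path are illustrative stand-ins for the response body that Capture() normally hands to youtube.Parse(), and it assumes yt-dlp is installed locally.

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
)

func main() {
	// Bail out early if yt-dlp is not installed, mirroring the check Crawl.Start() does.
	if _, found := ytdlp.FindPath(); !found {
		log.Fatal("yt-dlp not found in PATH")
	}

	// Any previously captured watch-page body works; a local file stands in for the
	// HTTP response body that Capture() passes to youtube.Parse(). (Illustrative path.)
	body, err := os.Open("watch_page.html")
	if err != nil {
		log.Fatal(err)
	}
	defer body.Close()

	// Serve the body on a random loopback port so yt-dlp reads our copy of the page
	// instead of fetching youtube.com again.
	port, stopChan, err := ytdlp.ServeBody(body)
	if err != nil {
		log.Fatal(err)
	}
	defer close(stopChan)

	// Run yt-dlp against the local copy: URLs holds the stream, thumbnail, storyboard
	// and subtitle URLs to capture, rawJSON is what Capture() writes as a WARC metadata
	// record, and headers are the HTTP headers to replay on the asset requests.
	URLs, rawJSON, headers, err := ytdlp.GetJSON(port)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%d URLs, %d bytes of JSON, User-Agent: %q\n", len(URLs), len(rawJSON), headers.UserAgent)
}

Running this against the bundled youtube_test.html fixture is essentially what TestParse does through the higher-level youtube.Parse wrapper.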