Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for ina.fr videos #152

Merged
merged 3 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/internetarchive/Zeno
go 1.22.4

require (
git.archive.org/wb/gocrawlhq v1.2.13
github.com/internetarchive/gocrawlhq v1.2.14
github.com/CorentinB/warc v0.8.53
github.com/PuerkitoBio/goquery v1.9.3
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGMUJqR4HBl4+8=
github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s=
github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0=
Expand Down
7 changes: 3 additions & 4 deletions internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL
if found {
continue
}

seencheckedBatch = append(seencheckedBatch, URL)
}

Expand All @@ -183,15 +184,13 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
var URL = utils.URLToString(item.URL)

// Execute plugins on the response
if strings.Contains(base.Host, "cloudflarestream.com") {
if cloudflarestream.IsURL(URL) {
cloudflarestreamURLs, err := cloudflarestream.GetSegments(base, *c.Client)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Warn("error getting cloudflarestream segments")
}

if len(cloudflarestreamURLs) > 0 {
assets = append(assets, cloudflarestreamURLs...)
}
assets = append(assets, cloudflarestreamURLs...)
}

// Get assets from JSON payloads in data-item values
Expand Down
45 changes: 37 additions & 8 deletions internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/ina"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/reddit"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/telegram"
Expand Down Expand Up @@ -309,7 +310,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
}
} else if vk.IsVKURL(utils.URLToString(item.URL)) {
vk.AddHeaders(req)
} else if reddit.IsRedditURL(utils.URLToString(item.URL)) {
} else if reddit.IsURL(utils.URLToString(item.URL)) {
reddit.AddCookies(req)
}

Expand Down Expand Up @@ -392,15 +393,12 @@ func (c *Crawl) Capture(item *queue.Item) error {
}

return nil
} else if reddit.IsRedditPostAPI(req) {
body, err := io.ReadAll(resp.Body)
} else if reddit.IsPostAPI(req) {
permalinks, rawAssets, err := reddit.ExtractPost(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
return err
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract post from Reddit")
}

permalinks, rawAssets, err := reddit.ExtractPost(body)

// Queue the permalinks
waitGroup.Add(1)
go c.queueOutlinks(utils.StringSliceToURLSlice(permalinks), item, &waitGroup)
Expand All @@ -416,6 +414,26 @@ func (c *Crawl) Capture(item *queue.Item) error {
}

return nil
} else if ina.IsAPIURL(req) {
rawAssets, err := ina.ExtractMedias(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract medias from INA")
}

if len(rawAssets) != 0 {
assets = c.seencheckAssets(rawAssets, item)

if len(assets) != 0 {
for _, asset := range rawAssets {
playerItem, err := queue.NewItem(asset, item.URL, "seed", 0, "", false)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from asset")
} else {
c.Capture(playerItem)
}
}
}
}
}

// Scrape potential URLs from Link HTTP header
Expand Down Expand Up @@ -480,7 +498,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
}

// Execute site-specific code on the document
if strings.Contains(base.Host, "cloudflarestream.com") {
if cloudflarestream.IsURL(base.Host) {
// Look for JS files necessary for the playback of the video
cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
if err != nil {
Expand Down Expand Up @@ -513,6 +531,17 @@ func (c *Crawl) Capture(item *queue.Item) error {
"type": "asset",
})).Info("URL archived")
}
} else if ina.IsURL(req) {
playerURLs := ina.ExtractPlayerURLs(doc, c.Client)

for _, playerURL := range playerURLs {
playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
} else {
c.Capture(playerItem)
}
}
}

// Websites can use a <base> tag to specify a base for relative URLs in every other tags.
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/crawl/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ import (
"sync"
"time"

"git.archive.org/wb/gocrawlhq"
"github.com/CorentinB/warc"
"github.com/google/uuid"
"github.com/internetarchive/Zeno/config"
"github.com/internetarchive/Zeno/internal/pkg/log"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/seencheck"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/internetarchive/gocrawlhq"
"github.com/paulbellamy/ratecounter"
)

Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/crawl/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ import (
"sync"
"time"

"git.archive.org/wb/gocrawlhq"
"github.com/CorentinB/warc"
"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/seencheck"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/internetarchive/gocrawlhq"
"github.com/prometheus/client_golang/prometheus"
"github.com/telanflow/cookiejar"
"mvdan.cc/xurls/v2"
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/crawl/hq.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ import (
"sync"
"time"

"git.archive.org/wb/gocrawlhq"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/internetarchive/gocrawlhq"
)

// This function connects to HQ's websocket and listen for messages.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ type MPD struct {
} `xml:"Period"`
}

// IsURL reports whether URL mentions the cloudflarestream.com domain.
//
// The check is a plain substring match, so it works the same way on a full
// URL string and on a bare host name.
func IsURL(URL string) bool {
	const domain = "cloudflarestream.com"

	return strings.Contains(URL, domain)
}

func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.CustomHTTPClient) (archivedURLs []string, err error) {
var latestJSURL string

Expand Down
195 changes: 195 additions & 0 deletions internal/pkg/crawl/sitespecific/ina/ina.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
package ina

import (
"encoding/json"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync"
"time"

"github.com/CorentinB/warc"
"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/utils"
)

// Package-level state shared by the JW Player helpers.
var (
	// playerVersion caches the body returned by the INA player-hub version
	// endpoint; it stays empty until getJWPlayerURLs has fetched it.
	playerVersion string

	// playerVersionLock is held by getJWPlayerURLs while it reads or writes
	// playerVersion.
	playerVersionLock sync.Mutex

	// playerRegex matches double-quoted, protocol-relative URLs of .js files
	// served from ssl.p.jwpcdn.com (quotes included in the match).
	playerRegex = regexp.MustCompile(`"//ssl\.p\.jwpcdn\.com[^"]+\.js"`)
)

// APIResponse is the structure ExtractMedias decodes the JSON body of an INA
// API response into. Field names and nesting follow the `json` struct tags
// below.
//
// ExtractMedias only reads ResourceURL, ResourceThumbnail, EmbedURL and URI;
// the remaining fields are decoded but not used anywhere else in this file.
//
// NOTE(review): EmbedURL is appended to "https://player.ina.fr" by
// ExtractMedias, so it is presumably a host-relative path — confirm against a
// real API payload.
type APIResponse struct {
	ID              string    `json:"id"`
	Title           string    `json:"title"`
	Description     string    `json:"description"`
	DateOfBroadcast time.Time `json:"dateOfBroadcast"`
	Type            string    `json:"type"`
	Duration        int       `json:"duration"`
	Categories      []any     `json:"categories"`
	// Credits is a list of JSON-LD style entries ("@context", "@type", "@id"
	// keys), each carrying its own list of key/value attributes.
	Credits []struct {
		Context struct {
			Vocab      string `json:"@vocab"`
			Hydra      string `json:"hydra"`
			Name       string `json:"name"`
			Value      string `json:"value"`
			Attributes string `json:"attributes"`
		} `json:"@context"`
		Type       string `json:"@type"`
		ID         string `json:"@id"`
		Name       string `json:"name"`
		Value      string `json:"value"`
		Attributes []struct {
			Context struct {
				Vocab string `json:"@vocab"`
				Hydra string `json:"hydra"`
				Key   string `json:"key"`
				Value string `json:"value"`
			} `json:"@context"`
			Type  string `json:"@type"`
			ID    string `json:"@id"`
			Key   string `json:"key"`
			Value string `json:"value"`
		} `json:"attributes"`
	} `json:"credits"`
	Restrictions                 []any  `json:"restrictions"`
	ResourceURL                  string `json:"resourceUrl"`
	ResourceThumbnail            string `json:"resourceThumbnail"`
	RestrictedBroadcastCountries []any  `json:"restrictedBroadcastCountries"`
	EmbedURL                     string `json:"embedUrl"`
	AllowEmbed                   bool   `json:"allowEmbed"`
	Ratio                        string `json:"ratio"`
	CollectionTitle              string `json:"collectionTitle"`
	IsOnline                     bool   `json:"isOnline"`
	AllowAds                     bool   `json:"allowAds"`
	TypeMedia                    string `json:"typeMedia"`
	HideLogo                     bool   `json:"hideLogo"`
	URI                          string `json:"uri"`
	AdvertisingAsset             bool   `json:"advertisingAsset"`
}

// IsURL reports whether req targets ina.fr or one of its subdomains.
//
// The decision is made on the request's host name rather than on the whole
// URL string, so hosts that merely end with the same letters (for example
// "china.fr") and URLs that only mention "ina.fr" in their path or query are
// not matched. The comparison is case-insensitive and ignores any port.
//
// A nil request, or a request without a URL, is never an INA URL.
func IsURL(req *http.Request) bool {
	if req == nil || req.URL == nil {
		return false
	}

	host := strings.ToLower(req.URL.Hostname())

	return host == "ina.fr" || strings.HasSuffix(host, ".ina.fr")
}

// IsAPIURL reports whether req targets the INA partner API
// (apipartner.ina.fr) and is not a request for a playerConfigurations.json
// document.
//
// The host name is compared exactly (case-insensitively, ignoring the port)
// instead of searching the whole URL string, so a URL on another host that
// merely mentions "apipartner.ina.fr" in its path or query is not matched.
// Any URL whose string form contains "playerConfigurations.json" is excluded.
//
// A nil request, or a request without a URL, is never an API URL.
func IsAPIURL(req *http.Request) bool {
	if req == nil || req.URL == nil {
		return false
	}

	host := strings.ToLower(req.URL.Hostname())
	if host != "apipartner.ina.fr" && !strings.HasSuffix(host, ".apipartner.ina.fr") {
		return false
	}

	return !strings.Contains(req.URL.String(), "playerConfigurations.json")
}

// ExtractPlayerURLs gathers the URLs referenced by the INA player elements of
// doc, followed by whatever getJWPlayerURLs returns for client c.
//
// For every `div[data-type=player]` element, the values of its "config-url",
// "asset-details-url" and "poster" attributes are collected, in that order,
// when the attribute is present. The raw strings are converted with
// utils.StringSliceToURLSlice before being returned.
func ExtractPlayerURLs(doc *goquery.Document, c *warc.CustomHTTPClient) []*url.URL {
	// Attributes of the player element that carry URLs, in collection order.
	playerAttrs := [...]string{"config-url", "asset-details-url", "poster"}

	var rawURLs []string

	doc.Find("div[data-type=player]").Each(func(_ int, player *goquery.Selection) {
		for _, attr := range playerAttrs {
			value, ok := player.Attr(attr)
			if !ok {
				continue
			}

			rawURLs = append(rawURLs, value)
		}
	})

	rawURLs = append(rawURLs, getJWPlayerURLs(c)...)

	return utils.StringSliceToURLSlice(rawURLs)
}

// getJWPlayerURLs returns the URLs of the static assets used by the INA
// player: the player-hub script, stylesheet and icons, plus the JW Player
// files referenced by player-hub's jwplayer.js.
//
// The player version is fetched from https://player-hub.ina.fr/version with c
// and cached in playerVersion under playerVersionLock. Once a version has
// been cached, later calls return nil: the asset list is only produced by the
// call that discovered the version.
//
// The function is best-effort. Any request error, non-200 status or read
// error stops the discovery and returns the URLs gathered so far (possibly
// none); no error is reported to the caller.
func getJWPlayerURLs(c *warc.CustomHTTPClient) (URLs []string) {
	playerVersionLock.Lock()
	defer playerVersionLock.Unlock()

	// A cached version means an earlier call already listed the assets.
	if playerVersion != "" {
		return URLs
	}

	resp, err := c.Get("https://player-hub.ina.fr/version")
	if err != nil {
		return URLs
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return URLs
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return URLs
	}

	// The version ends up in query strings, so surrounding whitespace (such
	// as a trailing newline) must not leak into the generated URLs. An empty
	// version is not cached, which lets the next call retry.
	version := strings.TrimSpace(string(body))
	if version == "" {
		return URLs
	}
	playerVersion = version

	URLs = append(URLs,
		"https://player-hub.ina.fr/dist/ina-player.min.js?version="+playerVersion,
		"https://player-hub.ina.fr/dist/player-default-skin.min.css?version="+playerVersion,
		"https://player-hub.ina.fr/assets/player/svg/pause.svg",
		"https://player-hub.ina.fr/assets/player/svg/play.svg",
		"https://player-hub.ina.fr/assets/player/svg/backward.svg",
		"https://player-hub.ina.fr/assets/player/svg/forward.svg",
	)

	// Get the JWPlayer JS code
	playerResp, err := c.Get("https://player-hub.ina.fr/js/jwplayer/jwplayer.js?version=" + playerVersion)
	if err != nil {
		return URLs
	}
	defer playerResp.Body.Close()

	if playerResp.StatusCode != http.StatusOK {
		return URLs
	}

	// Find the JWPlayer assets in the JS file
	body, err = io.ReadAll(playerResp.Body)
	if err != nil {
		return URLs
	}
	script := string(body)

	// Matches are quoted and protocol-relative (`"//host/path.js"`): drop the
	// quotes and prepend the scheme.
	for _, match := range playerRegex.FindAllString(script, -1) {
		URLs = append(URLs, "https:"+match[1:len(match)-1])
	}

	// Only add the core controls script when the JW Player version is known;
	// otherwise the URL would contain an empty path segment.
	if jwVersion := strings.TrimSpace(extractJWPlayerVersion(script)); jwVersion != "" {
		URLs = append(URLs, "https://ssl.p.jwpcdn.com/player/v/"+jwVersion+"/jwplayer.core.controls.html5.js")
	}

	return URLs
}

// extractJWPlayerVersion returns the JW Player version announced in body.
//
// body is scanned line by line for the marker "JW Player version " and the
// remainder of the first line carrying a non-empty value after the marker is
// returned, with surrounding whitespace (including a trailing "\r" from CRLF
// line endings) removed. It returns "" when no such line exists; a line
// that contains the marker text without anything after it is ignored rather
// than causing an out-of-range panic.
func extractJWPlayerVersion(body string) string {
	const marker = "JW Player version "

	for _, line := range strings.Split(body, "\n") {
		_, rest, found := strings.Cut(line, marker)
		if !found {
			continue
		}

		if version := strings.TrimSpace(rest); version != "" {
			return version
		}
	}

	return ""
}

// ExtractMedias reads the body of resp, decodes it as an APIResponse and
// returns the media URLs it references, in this order: the resource URL, the
// resource thumbnail, the embed URL (prefixed with "https://player.ina.fr")
// and the URI.
//
// Fields that are empty in the decoded response are skipped, so a partial
// payload does not produce empty URLs or a bare "https://player.ina.fr"
// entry. The remaining strings are converted with
// utils.StringSliceToURLSlice.
//
// It returns a nil slice and the underlying error when the body cannot be
// read or is not valid JSON for APIResponse. The response body is consumed
// but not closed; closing it is left to the caller.
func ExtractMedias(resp *http.Response) ([]*url.URL, error) {
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	var data APIResponse
	if err := json.Unmarshal(body, &data); err != nil {
		return nil, err
	}

	assets := make([]string, 0, 4)

	if data.ResourceURL != "" {
		assets = append(assets, data.ResourceURL)
	}

	if data.ResourceThumbnail != "" {
		assets = append(assets, data.ResourceThumbnail)
	}

	if data.EmbedURL != "" {
		assets = append(assets, "https://player.ina.fr"+data.EmbedURL)
	}

	if data.URI != "" {
		assets = append(assets, data.URI)
	}

	return utils.StringSliceToURLSlice(assets), nil
}
Loading