Skip to content

Commit

Permalink
Feat/Jaccard Similarity (#5)
Browse files Browse the repository at this point in the history
* new: feat: add sorting and filtering by similarity

* chg: fix: imdb link retrieval

* chg: fix: do not filter when query is empty
  • Loading branch information
felipemarinho97 authored Mar 10, 2024
1 parent 322bb34 commit 268ece5
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 4 deletions.
25 changes: 24 additions & 1 deletion api/bludv.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@ import (
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/hbollon/go-edlib"

"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
"github.com/felipemarinho97/torrent-indexer/utils"
)

var bludv = IndexerMeta{
Expand All @@ -29,7 +33,7 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
}()

ctx := r.Context()
// supported query params: q, season, episode
// supported query params: q, season, episode, filter_results
q := r.URL.Query().Get("q")

// URL encode query param
Expand Down Expand Up @@ -87,6 +91,25 @@ func (i *Indexer) HandlerBluDVIndexer(w http.ResponseWriter, r *http.Request) {
}
}

for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}

// remove the ones with zero similarity, but only when there are more than 20 results, filter_results is set and q is non-empty
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
return it.Similarity > 0
})
}

// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
})

w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Expand Down
24 changes: 23 additions & 1 deletion api/comando_torrents.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@ import (
"net/http"
"net/url"
"regexp"
"slices"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/felipemarinho97/torrent-indexer/magnet"
"github.com/felipemarinho97/torrent-indexer/schema"
goscrape "github.com/felipemarinho97/torrent-indexer/scrape"
"github.com/felipemarinho97/torrent-indexer/utils"
"github.com/hbollon/go-edlib"
)

var comando = IndexerMeta{
Expand Down Expand Up @@ -104,6 +107,25 @@ func (i *Indexer) HandlerComandoIndexer(w http.ResponseWriter, r *http.Request)
}
}

for i, it := range indexedTorrents {
jLower := strings.ReplaceAll(strings.ToLower(fmt.Sprintf("%s %s", it.Title, it.OriginalTitle)), ".", " ")
qLower := strings.ToLower(q)
splitLength := 2
indexedTorrents[i].Similarity = edlib.JaccardSimilarity(jLower, qLower, splitLength)
}

// remove the ones with zero similarity, but only when there are more than 20 results, filter_results is set and q is non-empty
if len(indexedTorrents) > 20 && r.URL.Query().Get("filter_results") != "" && r.URL.Query().Get("q") != "" {
indexedTorrents = utils.Filter(indexedTorrents, func(it IndexedTorrent) bool {
return it.Similarity > 0
})
}

// sort by similarity
slices.SortFunc(indexedTorrents, func(i, j IndexedTorrent) int {
return int((j.Similarity - i.Similarity) * 1000)
})

w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(Response{
Results: indexedTorrents,
Expand Down Expand Up @@ -176,7 +198,7 @@ func getTorrents(ctx context.Context, i *Indexer, link string) ([]IndexedTorrent

// find any link from imdb
imdbLink := ""
article.Find("div.content a").Each(func(i int, s *goquery.Selection) {
article.Find("a").Each(func(i int, s *goquery.Selection) {
link, _ := s.Attr("href")
re := regexp.MustCompile(`https://www.imdb.com/title/(tt\d+)`)
matches := re.FindStringSubmatch(link)
Expand Down
7 changes: 5 additions & 2 deletions api/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type IndexedTorrent struct {
Size string `json:"size"`
LeechCount int `json:"leech_count"`
SeedCount int `json:"seed_count"`
Similarity float32 `json:"similarity"`
}

func NewIndexers(redis *cache.Redis, metrics *monitoring.Metrics) *Indexer {
Expand All @@ -59,14 +60,16 @@ func HandlerIndex(w http.ResponseWriter, r *http.Request) {
"method": "GET",
"description": "Indexer for comando torrents",
"query_params": map[string]string{
"q": "search query",
"q": "search query",
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
},
},
"/indexers/bludv": map[string]interface{}{
"method": "GET",
"description": "Indexer for bludv",
"query_params": map[string]string{
"q": "search query",
"q": "search query",
"filter_results": "if results with similarity equals to zero should be filtered (true/false)",
},
},
},
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,6 @@ require (

require (
github.com/PuerkitoBio/goquery v1.9.1
github.com/hbollon/go-edlib v1.6.0
github.com/prometheus/client_golang v1.19.0
)
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,14 @@ github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/hbollon/go-edlib v1.6.0 h1:ga7AwwVIvP8mHm9GsPueC0d71cfRU/52hmPJ7Tprv4E=
github.com/hbollon/go-edlib v1.6.0/go.mod h1:wnt6o6EIVEzUfgbUZY7BerzQ2uvzp354qmS2xaLkrhM=
github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU=
github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k=
github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos=
Expand Down
12 changes: 12 additions & 0 deletions utils/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package utils

// Filter returns a new slice holding, in their original order, the elements
// of arr for which f returns true.
//
// arr itself is not modified. The result is always a non-nil slice: when arr
// is nil or empty, or when no element satisfies f, an empty slice of length
// zero is returned instead of nil.
func Filter[A any](arr []A, f func(A) bool) []A {
	// Start from an empty, non-nil slice so callers never receive nil.
	res := make([]A, 0)
	for _, v := range arr {
		if f(v) {
			res = append(res, v)
		}
	}
	return res
}

0 comments on commit 268ece5

Please sign in to comment.