Skip to content

Commit

Permalink
Merge pull request #33 from tech-engine/dev
Browse files Browse the repository at this point in the history
Update github actions and project readme docs
  • Loading branch information
tech-engine authored Nov 21, 2024
2 parents e89a35c + 18b77e3 commit 8c8e064
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: Test on Pull Request

on:
push:
branches:
- 'main'
pull_request:
branches:
- 'main'
Expand Down
85 changes: 55 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,53 +50,78 @@ This will create a new project directory with all the files necessary to begin w
✨ Congrates. books_to_scrape created successfully.
```

### main.go
In your __`main.go`__ file, set up and execute your spider.
### spider.go
In your __`spider.go`__ file, set up and execute your spider.

For detailed code, please refer to the [sample code here](./_examples/books.toscrape.com/main.go).
For detailed code, please refer to the [sample code here](./_examples/scrapejsp_method2/scrapejsp/spider.go).

```go
package main
package scrapejsp

import (
"context"
"errors"
"encoding/json"
"fmt"
"os"
"os/signal"
"books_to_scrape/books_to_scrape"
"sync"
"syscall"
"log"

"github.com/tech-engine/goscrapy/cmd/gos"
"github.com/tech-engine/goscrapy/pkg/core"
)

func main() {
ctx, cancel := context.WithCancel(context.Background())
// Spider is the example spider type. Its only field is the embedded
// gos.ICoreSpider[*Record], so that type's methods (NewRequest, Request and
// Yield are the ones called by the methods in this file) are promoted onto
// Spider.
type Spider struct {
gos.ICoreSpider[*Record]
}

// NewSpider creates the core with gos.New[*Record](), registers MIDDLEWARES
// and PIPELINES on it, starts it with core.Start(ctx) in a background
// goroutine, and returns a Spider wrapping the core together with a
// receive-only channel that carries the value returned by core.Start.
//
// NOTE(review): this span is a rendered diff without +/- markers. The lines
// using wg, books_to_scrape.New, errors.Is, fmt.Printf("failed: ...") and
// spider.StartRequest(ctx, nil) look like the removed main.go example
// interleaved with the new function body — confirm against the repository
// README before copying this snippet.
func NewSpider(ctx context.Context) (*Spider, <-chan error) {

var wg sync.WaitGroup
wg.Add(1)
// use proxies
// proxies := core.WithProxies("proxy_url1", "proxy_url2", ...)
// core := gos.New[*Record]().WithClient(
// gos.DefaultClient(proxies),
// )

spider, errCh := books_to_scrape.New(ctx)

go func() {
defer wg.Done()
core := gos.New[*Record]()

err := <-errCh
// Add middlewares
core.MiddlewareManager.Add(MIDDLEWARES...)
// Add pipelines
core.PipelineManager.Add(PIPELINES...)

if err != nil && errors.Is(err, context.Canceled) {
return
}
// errCh is unbuffered: the goroutine below blocks on its send until the
// caller receives from the returned channel.
errCh := make(chan error)

fmt.Printf("failed: %q", err)
// Run the core in the background and forward Start's result to errCh.
go func() {
errCh <- core.Start(ctx)
}()

// trigger the Start Request
spider.StartRequest(ctx, nil)
// core is stored as the Spider's embedded gos.ICoreSpider[*Record] field.
return &Spider{
core,
}, errCh
}

// StartRequest is the entrypoint to the spider: it builds a new request for a
// single todo item and hands it to s.Request with s.parse as the callback.
func (s *Spider) StartRequest(ctx context.Context, job *Job) {
	const todoURL = "https://jsonplaceholder.typicode.com/todos/1"

	request := s.NewRequest()
	// To attach the job to the request, uncomment:
	// request.Meta("JOB", job)
	request.Url(todoURL)

	s.Request(request, s.parse)
}

// Close does nothing in this example.
func (s *Spider) Close(ctx context.Context) {
	// No-op.
}

// parse is the callback passed to s.Request by StartRequest. It prints the
// response status code, decodes the response bytes as JSON into a Record, and
// yields a pointer to that Record.
func (s *Spider) parse(ctx context.Context, resp core.IResponseReader) {
// NOTE(review): the format string has no trailing newline.
fmt.Printf("status: %d", resp.StatusCode())

var data Record
err := json.Unmarshal(resp.Bytes(), &data)
if err != nil {
// NOTE(review): log.Fatalln calls os.Exit(1) after logging, so a single
// undecodable response terminates the whole process — consider logging and
// returning instead.
log.Fatalln(err)
}

// NOTE(review): the OnTerminate block below uses cancel and wg, which are not
// declared in this function; it looks like removed-side diff residue from the
// old main.go example — confirm.
OnTerminate(func() {
fmt.Println("exit signal received: shutting down gracefully")
cancel()
wg.Wait()
})
// to push to pipelines
s.Yield(&data)
}
```

Expand Down

0 comments on commit 8c8e064

Please sign in to comment.