# Skip to content

# gh scraper: improve logging and scrape everything (beyond configured date range) #112

# gh scraper: improve logging and scrape everything (beyond configured date range)

# gh scraper: improve logging and scrape everything (beyond configured date range) #112

# Workflow identity and triggers.
# Runs when started manually (workflow_dispatch) or when a pull request
# touches any file under scraper/, tests/ or schemas/.
name: Scraper Dry Run & Validate Schema
on:
  workflow_dispatch:
  pull_request:
    paths:
      - "scraper/**"
      - "tests/**"
      - "schemas/**"
jobs:
  # Single job: build the scraper, run it against the repository owner's
  # GitHub data, generate contributor files, upload both output directories
  # as an artifact, then install the root dependencies and run the tests.
  test-run-github-scraper:
    name: Test run GitHub Scraper
    runs-on: ubuntu-latest
    # Only read access to issues and pull requests is requested for the
    # job's GITHUB_TOKEN.
    # NOTE(review): scopes not listed here are not granted - confirm
    # checkout does not need `contents: read` for this repository.
    permissions:
      issues: read
      pull-requests: read
    env:
      DATA_REPO: ./
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          # Quoted so the version is always read as a string.
          node-version: "20.14.0"

      - name: Install pnpm
        run: npm install -g pnpm

      # The store path is resolved and the cache restored BEFORE any
      # `pnpm install`, so both install steps below can reuse cached packages.
      - name: Get pnpm store directory
        shell: bash
        run: |
          echo "STORE_PATH=$(pnpm store path --silent)" >> "$GITHUB_ENV"

      - name: Setup pnpm cache
        uses: actions/cache@v4
        with:
          path: ${{ env.STORE_PATH }}
          key: ${{ runner.os }}-pnpm-store-${{ hashFiles('**/pnpm-lock.yaml') }}
          restore-keys: |
            ${{ runner.os }}-pnpm-store-

      - name: Install scraper dependencies
        run: pnpm install --frozen-lockfile
        working-directory: scraper

      - name: Build the project
        run: pnpm build
        working-directory: scraper

      # Arguments: the repository owner, and the output directory given
      # relative to scraper/ (i.e. data/github at the repository root).
      - name: Scrape data from GitHub
        run: pnpm start ${{ github.repository_owner }} ../data/github
        working-directory: scraper
        env:
          # Falls back to the default token when SCRAPER_GITHUB_TOKEN is unset.
          GITHUB_TOKEN: ${{ secrets.SCRAPER_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}
          PROJECTS_BOARD_ID: PVT_kwDOA7JD8804yA

      # -p: do not fail when the directory already exists.
      - run: mkdir -p contributors

      # NOTE(review): presumably writes into contributors/ - confirm in script.
      - name: Generate markdown files for new contributors
        run: node scripts/generateNewContributors.js
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # Uploaded before the tests run, so the output is available even when
      # the test step fails.
      - uses: actions/upload-artifact@v4
        with:
          name: output
          retention-days: 5
          path: |
            data
            contributors

      - name: Install root dependencies
        run: pnpm install --frozen-lockfile

      - name: Run Tests
        run: pnpm test