Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add: npm ownership intermediate model #2520

Merged
merged 6 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions warehouse/dbt/macros/models/parse_npm_git_url.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{% macro parse_npm_git_url(key, source) %}

select
*,

case
when regexp_contains({{ key }}, r'^git\+ssh://') then
regexp_replace({{ key }}, r'^git\+ssh://([^@]+)@', 'https://')
when regexp_contains({{ key }}, r'^git@') then
regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/')
when regexp_contains({{ key }}, r'^git\+https://') then
regexp_replace({{ key }}, r'^git\+', '')
when regexp_contains({{ key }}, r'^https?://') then
{{ key }}
when regexp_contains({{ key }}, r'^[^:/]+\.[^:/]+/') then
concat('https://', {{ key }})
else null
end as remote_url,

regexp_extract(
case
when regexp_contains({{ key }}, r'\.git$') then
regexp_replace({{ key }}, r'\.git$', '')
else {{ key }}
end,
r'/([^/]+)$'
) as remote_name,

regexp_extract(
case
when regexp_contains({{ key }}, r'^git@') then
regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/')
when regexp_contains({{ key }}, r'^git\+ssh://') then
regexp_replace({{ key }}, r'^git\+ssh://', 'https://')
else {{ key }}
end,
r'https?:\/\/[^\/]+\/([^\/]+)\/[^\/]+$'
) as remote_namespace,

case
when regexp_contains({{ key }}, r'github\.com') then 'GITHUB'
when regexp_contains({{ key }}, r'gitlab\.com') then 'GITLAB'
when regexp_contains({{ key }}, r'bitbucket\.org') then 'BITBUCKET'
else 'OTHER'
end as remote_source_id

from {{ source }}

{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
with npm_artifacts as (
select artifact_name
from {{ ref('artifacts_v1') }}
where artifact_source = 'NPM'
),

npm_manifests as (
select
`name`,
repository__url,
repository__type,
concat('https://www.npmjs.com/package/', `name`) as artifact_url
from {{ ref('stg_npm__manifests') }}
where
`name` in (select * from npm_artifacts)
and repository__url is not null
),

npm_repository_urls as (
{{ parse_npm_git_url('repository__url', 'npm_manifests') }}
),

npm_artifact_ownership as (
select
{{ oso_id(
"'NPM'",
"artifact_url",
) }} as artifact_id,
artifact_url,
`name` as artifact_name,
'NPM' as artifact_source_id,
remote_url,
remote_name,
remote_namespace,
remote_source_id
from npm_repository_urls
)

select * from npm_artifact_ownership
9 changes: 9 additions & 0 deletions warehouse/dbt/models/npm_sources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
sources:
- name: npm
database: opensource-observer
schema: npm
tables:
- name: downloads
identifier: downloads
- name: manifests
identifier: manifests
9 changes: 9 additions & 0 deletions warehouse/dbt/models/staging/npm/stg_npm__downloads.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
with source as (
select
`date`,
artifact_name,
downloads
from {{ source('npm', 'downloads') }}
)

select * from source
25 changes: 25 additions & 0 deletions warehouse/dbt/models/staging/npm/stg_npm__manifests.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{% set columns = [
"name", "version", "description", "keywords", "homepage", "bugs",
"license", "author", "contributors", "funding", "files", "exports",
"main", "browser", "bin", "man", "directories", "repository",
"scripts", "config", "dependencies", "dev_dependencies",
"peer_dependencies", "peer_dependencies_meta", "bundle_dependencies",
"optional_dependencies", "overrides", "engines", "os", "cpu",
"dev_engines", "private", "publish_config", "workspaces", "bugs__url",
"repository__url", "repository__type", "author__url", "author__name",
"author__email"
] %}

with source as (
select * from {{ source('npm', 'manifests') }}
),

renamed as (
select
{% for column in columns %}
{{ adapter.quote(column) }}{% if not loop.last %},{% endif %}
{% endfor %}
from source
)

select * from renamed
150 changes: 139 additions & 11 deletions warehouse/oso_dagster/assets/npm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime, timedelta
from typing import Generator, List, Optional
from typing import Dict, Generator, List, Optional, Union

import dlt
import requests
Expand All @@ -9,22 +9,61 @@

from ..factories.dlt import dlt_factory, pydantic_to_dlt_nullable_columns

# Host for the NPM API
NPM_API_HOST = "https://api.npmjs.org"

# Host for the NPM registry
NPM_HOST = "https://api.npmjs.org"
NPM_REGISTRY_HOST = "https://registry.npmjs.org"

# NPM was launched on January 12, 2010
NPM_EPOCH = "2010-01-12T00:00:00Z"
# https://github.com/npm/registry/blob/main/docs/download-counts.md#limits
NPM_EPOCH = "2015-01-10T00:00:00Z"


class NPMPackageInfo(BaseModel):
class NPMPackageDownloadInfo(BaseModel):
date: datetime
artifact_name: str
downloads: int


class NPMPackageManifest(BaseModel):
name: Optional[str] = None
version: Optional[str] = None
description: Optional[str] = None
keywords: Optional[List] = None
homepage: Optional[str] = None
bugs: Optional[Union[str, Dict]] = None
license: Optional[str] = None
author: Optional[Union[str, Dict]] = None
contributors: Optional[List] = None
funding: Optional[Union[str, Dict, List]] = None
files: Optional[List] = None
exports: Optional[Dict] = None
main: Optional[str] = None
browser: Optional[bool] = None
man: Optional[Union[str, Dict, List]] = None
directories: Optional[Dict] = None
repository: Optional[Union[str, Dict]] = None
scripts: Optional[Dict] = None
config: Optional[Dict] = None
dependencies: Optional[Dict] = None
devDependencies: Optional[Dict] = None
peerDependencies: Optional[Dict] = None
peerDependenciesMeta: Optional[Dict] = None
bundleDependencies: Optional[List] = None
optionalDependencies: Optional[Dict] = None
overrides: Optional[Dict] = None
engines: Optional[Dict] = None
os: Optional[List] = None
cpu: Optional[List] = None
devEngines: Optional[Dict] = None
private: Optional[bool] = None
publishConfig: Optional[Dict] = None
workspaces: Optional[List] = None


def get_npm_package_downloads(
package_name: str, date_from: datetime, date_to: datetime
) -> Generator[Optional[NPMPackageInfo], None, None]:
) -> Generator[Optional[NPMPackageDownloadInfo], None, None]:
"""
Fetches the download count for an NPM package between two dates.

Expand All @@ -34,13 +73,13 @@ def get_npm_package_downloads(
date_to (datetime): The end date

Yields:
Optional[NPMPackageInfo]: The download count for the package
Optional[NPMPackageDownloadInfo]: The download count for the package
"""

str_from = date_from.strftime("%Y-%m-%d")
str_to = date_to.strftime("%Y-%m-%d")

endpoint = f"{NPM_HOST}/downloads/range/{str_from}:{str_to}/{package_name}"
endpoint = f"{NPM_API_HOST}/downloads/range/{str_from}:{str_to}/{package_name}"
response = requests.get(
endpoint,
timeout=10,
Expand Down Expand Up @@ -89,7 +128,7 @@ def get_npm_package_downloads(
total_downloads = sum(download["downloads"] for download in data["downloads"])

yield (
NPMPackageInfo(
NPMPackageDownloadInfo(
date=date_from,
artifact_name=package_name,
downloads=total_downloads,
Expand All @@ -99,10 +138,44 @@ def get_npm_package_downloads(
)


def get_npm_package_manifest(
package_name: str,
) -> Generator[Optional[NPMPackageManifest], None, None]:
"""
Fetches the manifest for an NPM package.

Args:
context (AssetExecutionContext): The asset execution context
package_name (str): The NPM package name

Yields:
Optional[NPMPackageManifest]: The manifest for the package
"""

endpoint = f"{NPM_REGISTRY_HOST}/{package_name}/latest"
response = requests.get(
endpoint,
timeout=10,
headers={
"X-URL": "https://github.com/opensource-observer/oso",
"X-Contact": "[email protected]",
"X-Purpose": "We are currently indexing NPM packages to provide dependency statistics. "
"If you have any questions or concerns, please contact us",
},
)

data = response.json()

if not response.ok:
raise ValueError(f"Failed to fetch data for {package_name}: {response.text}")

yield NPMPackageManifest(**data)


@dlt.resource(
primary_key="artifact_name",
name="downloads",
columns=pydantic_to_dlt_nullable_columns(NPMPackageInfo),
columns=pydantic_to_dlt_nullable_columns(NPMPackageDownloadInfo),
)
def get_all_downloads(
context: AssetExecutionContext,
Expand All @@ -117,7 +190,7 @@ def get_all_downloads(
package_names (List[str]): List of NPM package names to fetch

Yields:
List[NPMPackageInfo]: The download count for each package
List[NPMPackageDownloadInfo]: The download count for each package
"""

start = datetime.strptime(context.partition_key, "%Y-%m-%d")
Expand All @@ -134,6 +207,33 @@ def get_all_downloads(
)


@dlt.resource(
primary_key="name",
name="manifests",
columns=pydantic_to_dlt_nullable_columns(NPMPackageManifest),
)
def get_all_manifests(
context: AssetExecutionContext,
package_names: List,
):
"""
Fetches the manifest for a list of NPM packages.

Args:
context (AssetExecutionContext): The asset execution
package_names (List): List of NPM package names to fetch

Yields:
List[NPMPackageManifest]: The manifest for each package
"""

context.log.info(f"Processing NPM manifests for {len(package_names)} packages")

yield from (
get_npm_package_manifest(package_name) for package_name in package_names
)


@dlt_factory(
key_prefix="npm",
partitions_def=WeeklyPartitionsDefinition(
Expand Down Expand Up @@ -164,3 +264,31 @@ def downloads(
for row in client.query_with_string(unique_artifacts_query)
],
)


@dlt_factory(
key_prefix="npm",
deps=[AssetKey(["dbt", "production", "artifacts_v1"])],
)
def manifests(
context: AssetExecutionContext,
cbt: CBTResource,
):
unique_artifacts_query = """
SELECT
DISTINCT(artifact_name)
FROM
`oso.artifacts_v1`
WHERE
artifact_source = "NPM"
"""

client = cbt.get(context.log)

yield get_all_manifests(
context,
package_names=[
row["artifact_name"]
for row in client.query_with_string(unique_artifacts_query)
],
)