Skip to content

Commit

Permalink
fix: flatten npm manifest structure (#2524)
Browse files Browse the repository at this point in the history
* fix: flatten `npm` manifest structure

* fix: update `npm` manifests staging columns

* fix: update repository field references in `int_artifact_ownership` model
  • Loading branch information
Jabolol authored Nov 26, 2024
1 parent edc0465 commit 7118352
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ with npm_artifacts as (
npm_manifests as (
select
`name`,
repository__url,
repository__type,
json_value(repository, '$.url') as manifest_repository_url,
json_value(repository, '$.type') as manifest_repository_type,
concat('https://www.npmjs.com/package/', `name`) as artifact_url
from {{ ref('stg_npm__manifests') }}
where
`name` in (select * from npm_artifacts)
and repository__url is not null
and json_value(repository, '$.url') is not null
),

npm_repository_urls as (
{{ parse_npm_git_url('repository__url', 'npm_manifests') }}
{{ parse_npm_git_url('manifest_repository_url', 'npm_manifests') }}
),

npm_artifact_ownership as (
Expand Down
17 changes: 8 additions & 9 deletions warehouse/dbt/models/staging/npm/stg_npm__manifests.sql
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
{% set columns = [
"name", "version", "description", "keywords", "homepage", "bugs",
"license", "author", "contributors", "funding", "files", "exports",
"main", "browser", "bin", "man", "directories", "repository",
"scripts", "config", "dependencies", "dev_dependencies",
"peer_dependencies", "peer_dependencies_meta", "bundle_dependencies",
"optional_dependencies", "overrides", "engines", "os", "cpu",
"dev_engines", "private", "publish_config", "workspaces", "bugs__url",
"repository__url", "repository__type", "author__url", "author__name",
"author__email"
"name", "version", "description", "keywords", "homepage", "bugs",
"license", "author", "contributors", "funding", "files", "exports",
"main", "browser", "bin", "man", "directories", "repository",
"scripts", "config", "dependencies", "dev_dependencies",
"peer_dependencies", "peer_dependencies_meta", "bundle_dependencies",
"optional_dependencies", "overrides", "engines", "os", "cpu",
"dev_engines", "private", "publish_config", "workspaces",
"_dlt_load_id", "_dlt_id"
] %}

with source as (
Expand Down
58 changes: 49 additions & 9 deletions warehouse/oso_dagster/assets/npm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime, timedelta
from typing import Dict, Generator, List, Optional, Union
from typing import Dict, Generator, List, Optional, Any

import dlt
import requests
Expand Down Expand Up @@ -31,18 +31,19 @@ class NPMPackageManifest(BaseModel):
description: Optional[str] = None
keywords: Optional[List] = None
homepage: Optional[str] = None
bugs: Optional[Union[str, Dict]] = None
license: Optional[str] = None
author: Optional[Union[str, Dict]] = None
bugs: Optional[Dict] = None
license: Optional[Dict] = None
author: Optional[Dict] = None
contributors: Optional[List] = None
funding: Optional[Union[str, Dict, List]] = None
funding: Optional[List] = None
files: Optional[List] = None
exports: Optional[Dict] = None
main: Optional[str] = None
browser: Optional[bool] = None
man: Optional[Union[str, Dict, List]] = None
browser: Optional[Dict] = None
bin: Optional[Dict] = None
man: Optional[List] = None
directories: Optional[Dict] = None
repository: Optional[Union[str, Dict]] = None
repository: Optional[Dict] = None
scripts: Optional[Dict] = None
config: Optional[Dict] = None
dependencies: Optional[Dict] = None
Expand All @@ -61,6 +62,45 @@ class NPMPackageManifest(BaseModel):
workspaces: Optional[List] = None


# npm lets several manifest fields be written in shorthand (a bare string or
# boolean) as well as in their full object/list form. Each entry below maps a
# field name to a function that coerces the shorthand into the full form, so
# every manifest reaches the model with one consistent shape per field.
# Values already in the full form (and None) pass through unchanged.


def _funding_entry(entry: Any) -> Any:
    """Expand a bare funding URL string into ``{"type": "url", "url": ...}``."""
    return {"type": "url", "url": entry} if isinstance(entry, str) else entry


def _normalize_funding(value: Any) -> Any:
    """
    Coerce a ``funding`` value into a list of funding entries.

    Args:
        value (Any): A URL string, a single funding object, a list mixing
            both, or None

    Returns:
        Any: A list of entries with string entries expanded to objects;
            any other kind of value is returned as-is
    """

    if isinstance(value, (str, dict)):
        return [_funding_entry(value)]
    if isinstance(value, list):
        # A funding list may itself contain bare URL strings next to objects;
        # expand those too so the list elements share one shape.
        return [_funding_entry(entry) for entry in value]
    return value


TRANSFORMATIONS = {
    "bugs": lambda value: {"url": value} if isinstance(value, str) else value,
    "license": lambda value: {"type": value} if isinstance(value, str) else value,
    "author": lambda value: {"author": value} if isinstance(value, str) else value,
    "funding": _normalize_funding,
    "exports": lambda value: {".": value} if isinstance(value, str) else value,
    "bin": lambda value: {"path": value} if isinstance(value, str) else value,
    "man": lambda value: [value] if isinstance(value, str) else value,
    "browser": lambda value: (
        {"browser": value} if isinstance(value, (str, bool)) else value
    ),
    "repository": lambda value: {"url": value} if isinstance(value, str) else value,
}


def flatten_manifest(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Applies transformations to the data before creating the manifest object.

    Every field named in ``TRANSFORMATIONS`` that is present in ``data`` is
    passed through its transform; all other fields are copied over unchanged
    and key order is preserved. The input dictionary is not modified.

    Args:
        data (Dict[str, Any]): The data to transform

    Returns:
        Dict[str, Any]: A new dictionary holding the transformed data
    """

    # Build a fresh dict rather than assigning into ``data`` so the caller's
    # object is not changed as a side effect.
    return {
        key: TRANSFORMATIONS[key](value) if key in TRANSFORMATIONS else value
        for key, value in data.items()
    }


def get_npm_package_downloads(
package_name: str, date_from: datetime, date_to: datetime
) -> Generator[Optional[NPMPackageDownloadInfo], None, None]:
Expand Down Expand Up @@ -169,7 +209,7 @@ def get_npm_package_manifest(
if not response.ok:
raise ValueError(f"Failed to fetch data for {package_name}: {response.text}")

yield NPMPackageManifest(**data)
yield NPMPackageManifest(**flatten_manifest(data))


@dlt.resource(
Expand Down

0 comments on commit 7118352

Please sign in to comment.