Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ecosystem specific inclusions or exclusions #1550

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions scanpipe/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO we need a better name as this is not a general purpose configuration. It could be either:

  • ecosystem_config.py
  • or better d2d_config.py as this is really focused on d2d for now, and may even need to be brought down one dir under /pipelines

#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.


class EcosystemConfig:
    """
    Base class holding the ecosystem specific settings.

    Each supported ecosystem subclasses this and overrides only the
    attributes it needs; the others keep the empty defaults set here.
    """

    # Name of the pipeline option which selects this ecosystem.
    # It has to be defined by every ecosystem.
    ecosystem_option = None

    # Package extensions of this ecosystem which need to be
    # matched from purldb
    purldb_package_extensions = []

    # Resource extensions of this ecosystem which need to be
    # matched from purldb
    purldb_resource_extensions = []

    # Document file extensions which do not require review
    doc_extensions = []

    # Paths on the deployed side (to/, binaries/archives) which need no
    # review, even when they are not matched to the source side
    deployed_resource_path_exclusions = []

    # Paths on the development/source side (from/) which should not be
    # considered, even when unmapped to the deployed side, while assessing
    # what to review on the deployed side
    devel_resource_path_exclusions = []

    # Symbols found in the ecosystem specific standard libraries,
    # which are of little use for mapping
    standard_symbols_to_exclude = []


class DefaultEcosystemConfig(EcosystemConfig):
    """Settings shared by all ecosystems, whatever the selected options."""

    ecosystem_option = "Default"

    # NOTE(review): flagged during review as a fairly limited list,
    # but considered acceptable as a default.
    purldb_package_extensions = [".zip", ".tar.gz", ".tar.xz"]

    devel_resource_path_exclusions = ["*/tests/*"]

    doc_extensions = [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".tex", ".odt", ".odp"]


class JavaEcosystemConfig(EcosystemConfig):
    """Java settings: ``.jar``/``.war`` packages and ``.class`` resources."""

    ecosystem_option = "Java"

    purldb_resource_extensions = [".class"]
    purldb_package_extensions = [".jar", ".war"]


class JavaScriptEcosystemConfig(EcosystemConfig):
    """JavaScript settings: script, type definition, map and stylesheet resources."""

    ecosystem_option = "JavaScript"

    purldb_resource_extensions = [
        # Source maps, scripts and their typed variants
        ".map", ".js", ".mjs", ".ts", ".d.ts", ".jsx", ".tsx",
        # Stylesheets and templates
        ".css", ".scss", ".less", ".sass", ".soy",
    ]  # fmt: skip


class GoEcosystemConfig(EcosystemConfig):
    """Go settings: ``.go`` resources are matched from purldb."""

    purldb_resource_extensions = [".go"]
    ecosystem_option = "Go"


class RustEcosystemConfig(EcosystemConfig):
    """Rust settings: ``.rs`` resources are matched from purldb."""

    purldb_resource_extensions = [".rs"]
    ecosystem_option = "Rust"


class RubyEcosystemConfig(EcosystemConfig):
    """
    Ruby settings: ``.gem`` packages, ``.rb`` resources and the deployed
    paths which need no review.
    """

    ecosystem_option = "Ruby"

    purldb_package_extensions = [".gem"]
    purldb_resource_extensions = [".rb"]

    # Deployed paths matching these patterns are ignored rather than reviewed.
    deployed_resource_path_exclusions = [
        "*checksums.yaml.gz*",
        "*metadata.gz*",
    ]
5 changes: 4 additions & 1 deletion scanpipe/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ def flag_ignored_resources(self):
ignored_patterns = ignored_patterns.splitlines()
ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)

flag.flag_ignored_patterns(self.project, patterns=ignored_patterns)
flag.flag_ignored_patterns(
codebaseresources=self.project.codebaseresources.no_status(),
patterns=ignored_patterns,
)

def extract_archive(self, location, target):
"""Extract archive at `location` to `target`. Save errors as messages."""
Expand Down
47 changes: 19 additions & 28 deletions scanpipe/pipelines/deploy_to_develop.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from aboutcode.pipeline import optional_step
from scanpipe import pipes
from scanpipe.config import DefaultEcosystemConfig
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import d2d
from scanpipe.pipes import flag
Expand All @@ -31,7 +32,7 @@
from scanpipe.pipes import scancode


class DeployToDevelop(Pipeline):
class DeployToDevelop(Pipeline, DefaultEcosystemConfig):
"""
Establish relationships between two code trees: deployment and development.

Expand Down Expand Up @@ -64,6 +65,8 @@ def steps(cls):
cls.flag_empty_files,
cls.flag_whitespace_files,
cls.flag_ignored_resources,
cls.load_ecosystem_config,
cls.load_ecosystem_config_ruby,
cls.map_about_files,
cls.map_checksum,
cls.match_archives_to_purldb,
Expand Down Expand Up @@ -91,33 +94,6 @@ def steps(cls):
cls.create_local_files_packages,
)

purldb_package_extensions = [".jar", ".war", ".zip"]
purldb_resource_extensions = [
".map",
".js",
".mjs",
".ts",
".d.ts",
".jsx",
".tsx",
".css",
".scss",
".less",
".sass",
".soy",
".class",
]
doc_extensions = [
".pdf",
".doc",
".docx",
".ppt",
".pptx",
".tex",
".odt",
".odp",
]

def get_inputs(self):
"""Locate the ``from`` and ``to`` input files."""
self.from_files, self.to_files = d2d.get_inputs(self.project)
Expand Down Expand Up @@ -152,6 +128,15 @@ def flag_whitespace_files(self):
"""Flag whitespace files with size less than or equal to 100 byte as ignored."""
d2d.flag_whitespace_files(project=self.project)

def load_ecosystem_config(self):
    """Apply the d2d ecosystem configurations of the selected options to this pipeline."""
    # The selected groups of this run are the ecosystem options to load.
    selected_options = self.selected_groups
    d2d.load_ecosystem_config(pipeline=self, options=selected_options)

@optional_step("Ruby")
def load_ecosystem_config_ruby(self):
    """
    Load Ruby specific configurations for d2d steps.

    This step does nothing by itself.
    NOTE(review): presumably it only exists so that "Ruby" is available as
    an optional group, the Ruby configuration then being applied by the
    ``load_ecosystem_config`` step from the selected groups -- confirm.
    """

def map_about_files(self):
"""Map ``from/`` .ABOUT files to their related ``to/`` resources."""
d2d.map_about_files(project=self.project, logger=self.log)
Expand Down Expand Up @@ -268,6 +253,7 @@ def flag_mapped_resources_archives_and_ignored_directories(self):
def perform_house_keeping_tasks(self):
"""
On deployed side
- Ignore specific files based on ecosystem based configurations.
- PurlDB match files with ``no-java-source`` and empty status,
if no match is found update status to ``requires-review``.
- Update status for uninteresting files.
Expand All @@ -278,6 +264,11 @@ def perform_house_keeping_tasks(self):
"""
d2d.match_resources_with_no_java_source(project=self.project, logger=self.log)
d2d.handle_dangling_deployed_legal_files(project=self.project, logger=self.log)
d2d.ignore_unmapped_resources_from_config(
project=self.project,
patterns_to_ignore=self.deployed_resource_path_exclusions,
logger=self.log,
)
d2d.match_unmapped_resources(
project=self.project,
matched_extensions=self.purldb_resource_extensions,
Expand Down
74 changes: 74 additions & 0 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from summarycode.classify import LEGAL_STARTS_ENDS

from aboutcode.pipeline import LoopProgress
from scanpipe import config
from scanpipe import pipes
from scanpipe.models import CodebaseRelation
from scanpipe.models import CodebaseResource
Expand All @@ -66,6 +67,16 @@
TO = "to/"


# Ecosystem configuration classes available to the d2d pipes.
# ``load_ecosystem_config`` indexes them by their ``ecosystem_option`` value.
ECOSYSTEM_CONFIGS = [
config.DefaultEcosystemConfig,
config.JavaEcosystemConfig,
config.JavaScriptEcosystemConfig,
config.RubyEcosystemConfig,
config.RustEcosystemConfig,
config.GoEcosystemConfig,
]


def get_inputs(project):
"""
Locate the ``from`` and ``to`` input files in project inputs/ directory.
Expand Down Expand Up @@ -114,6 +125,55 @@ def get_best_path_matches(to_resource, matches):
return matches


def load_ecosystem_config(pipeline, options):
    """
    Add ecosystem specific configurations for each ecosystem selected
    as `options` to the `pipeline`.

    The "Default" configuration is always added first. Each entry of
    `options` naming a known ``ecosystem_option`` is added after it, in
    order; entries which do not name a known ecosystem are skipped.
    `options` may be ``None`` or empty, in which case only the default
    configuration is added.
    """
    configs_by_ecosystem = {
        ecosystem.ecosystem_option: ecosystem for ecosystem in ECOSYSTEM_CONFIGS
    }

    # Add default configurations which are common across ecosystems
    add_ecosystem_config(
        pipeline=pipeline,
        configs_by_ecosystem=configs_by_ecosystem,
        selected_option="Default",
    )

    # Add configurations for each selected ecosystem.
    # Nothing selected can reach here as ``None``: treat it as an empty selection
    # instead of failing on the iteration.
    for selected_option in options or []:
        if selected_option not in configs_by_ecosystem:
            continue

        add_ecosystem_config(
            pipeline=pipeline,
            configs_by_ecosystem=configs_by_ecosystem,
            selected_option=selected_option,
        )


def add_ecosystem_config(pipeline, configs_by_ecosystem, selected_option):
    """
    Merge the configuration of the `selected_option` ecosystem into `pipeline`.

    `configs_by_ecosystem` maps an ecosystem option name to its configuration
    class. For each d2d setting with a non-empty value on that configuration,
    the values missing from the current `pipeline` setting are appended to it,
    keeping the existing order. Nothing is done when `selected_option` is not
    a key of `configs_by_ecosystem`.

    The merged value is always a new list set on the `pipeline` instance, so
    the lists defined at class level (on the pipeline class or on the
    configuration classes) are never modified.
    """
    d2d_pipeline_configs = [
        "purldb_package_extensions",
        "purldb_resource_extensions",
        "deployed_resource_path_exclusions",
    ]

    ecosystem_config = configs_by_ecosystem.get(selected_option)
    if ecosystem_config is None:
        # Unknown ecosystem: there is nothing to merge.
        return

    for pipeline_config in d2d_pipeline_configs:
        config_value = getattr(ecosystem_config, pipeline_config)
        if not config_value:
            continue

        # Copy instead of extending in place: ``list.extend`` returns None and
        # would also alter the list shared through the class attribute.
        merged_value = list(getattr(pipeline, pipeline_config) or [])
        for value in config_value:
            if value not in merged_value:
                merged_value.append(value)

        setattr(pipeline, pipeline_config, merged_value)


def get_from_files_for_scanning(resources):
"""
Return resources in the "from/" side which has been mapped to the "to/"
Expand Down Expand Up @@ -1453,6 +1513,20 @@ def match_resources_with_no_java_source(project, logger=None):
)


def ignore_unmapped_resources_from_config(project, patterns_to_ignore, logger=None):
    """
    Flag the ``to/`` resources of `project` which have no status yet and a
    path matching one of `patterns_to_ignore` as ignored from configuration.
    The count of flagged resources is reported to `logger` when provided.
    """
    unmapped_to_resources = project.codebaseresources.to_codebase().no_status()
    ignored_resources_count = flag.flag_ignored_patterns(
        codebaseresources=unmapped_to_resources,
        patterns=patterns_to_ignore,
        status=flag.IGNORED_FROM_CONFIG,
    )

    if not logger:
        return

    # NOTE(review): the message reads "with from"; the wording looks
    # unintended but is kept as-is here.
    logger(
        f"Ignoring {ignored_resources_count:,d} to/ resources with "
        "from ecosystem specific configurations."
    )


def match_unmapped_resources(project, matched_extensions=None, logger=None):
"""
Match resources with empty status to PurlDB, if unmatched
Expand Down
7 changes: 4 additions & 3 deletions scanpipe/pipes/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
IGNORED_DEFAULT_IGNORES = "ignored-default-ignores"
IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues"
IGNORED_DOC_FILE = "ignored-doc-file"
IGNORED_FROM_CONFIG = "ignored-from-config"

COMPLIANCE_LICENSES = "compliance-licenses"
COMPLIANCE_SOURCEMIRROR = "compliance-sourcemirror"
Expand Down Expand Up @@ -89,15 +90,15 @@ def flag_ignored_directories(project):
return qs.update(status=IGNORED_DIRECTORY)


def flag_ignored_patterns(codebaseresources, patterns, status=IGNORED_PATTERN):
    """
    Set `status` on the `codebaseresources` whose path matches one of the
    `patterns` and return the total count of updated resources.

    `patterns` is a list of patterns or a single string holding one pattern
    per line. `status` defaults to the ignored pattern status.
    """
    pattern_list = patterns.splitlines() if isinstance(patterns, str) else patterns

    # One update query per pattern; each update returns its own count.
    return sum(
        codebaseresources.path_pattern(pattern).update(status=status)
        for pattern in pattern_list
    )

Expand Down
7 changes: 5 additions & 2 deletions scanpipe/tests/pipes/test_flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ def test_scanpipe_pipes_flag_flag_ignored_directories(self):

def test_scanpipe_pipes_flag_flag_ignored_patterns(self):
patterns = ["*.ext", "dir/*"]
updated = flag.flag_ignored_patterns(self.project1, patterns)
updated = flag.flag_ignored_patterns(
self.project1.codebaseresources.no_status(), patterns
)

self.assertEqual(3, updated)
self.resource1.refresh_from_db()
Expand All @@ -85,7 +87,8 @@ def test_scanpipe_pipes_flag_flag_ignored_patterns(self):
make_resource_file(self.project1, "path/deeper/policies.yml")
make_resource_file(self.project1, "path/other-policies.yml")
updated = flag.flag_ignored_patterns(
self.project1, flag.DEFAULT_IGNORED_PATTERNS
self.project1.codebaseresources.no_status(),
flag.DEFAULT_IGNORED_PATTERNS,
)
self.assertEqual(3, updated)

Expand Down
5 changes: 4 additions & 1 deletion scanpipe/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,10 @@ def test_scanpipe_pipeline_class_flag_ignored_resources(self):
mock_flag.return_value = None
pipeline.flag_ignored_resources()
patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS]
mock_flag.assert_called_with(project1, patterns=patterns_args)
mock_flag.assert_called_with(
codebaseresources=project1.codebaseresources.no_status(),
patterns=patterns_args,
)

def test_scanpipe_pipeline_class_extract_archive(self):
project1 = Project.objects.create(name="Analysis")
Expand Down
Loading