Add base pipeline for importers and migrate PyPa importer to aboutcode pipeline by keshav-space · Pull Request #1559 · aboutcode-org/vulnerablecode · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add base pipeline for importers and migrate PyPa importer to aboutcode pipeline #1559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion vulnerabilities/import_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import Importer
from vulnerabilities.importers import IMPORTERS_REGISTRY
from vulnerabilities.improver import Inference
from vulnerabilities.improvers.default import DefaultImporter
from vulnerabilities.models import Advisory
Expand Down
4 changes: 2 additions & 2 deletions vulnerabilities/importers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from vulnerabilities.importers import oss_fuzz
from vulnerabilities.importers import postgresql
from vulnerabilities.importers import project_kb_msr2019
from vulnerabilities.importers import pypa
from vulnerabilities.importers import pysec
from vulnerabilities.importers import redhat
from vulnerabilities.importers import retiredotnet
Expand All @@ -40,13 +39,13 @@
from vulnerabilities.importers import ubuntu_usn
from vulnerabilities.importers import vulnrichment
from vulnerabilities.importers import xen
from vulnerabilities.pipelines import pypa_importer

IMPORTERS_REGISTRY = [
nvd.NVDImporter,
github.GitHubAPIImporter,
gitlab.GitLabAPIImporter,
npm.NpmImporter,
pypa.PyPaImporter,
nginx.NginxImporter,
pysec.PyPIImporter,
alpine_linux.AlpineImporter,
Expand Down Expand Up @@ -75,6 +74,7 @@
github_osv.GithubOSVImporter,
epss.EPSSImporter,
vulnrichment.VulnrichImporter,
pypa_importer.PyPaImporterPipeline,
]

IMPORTERS_REGISTRY = {x.qualified_name: x for x in IMPORTERS_REGISTRY}
66 changes: 0 additions & 66 deletions vulnerabilities/importers/pypa.py

This file was deleted.

8 changes: 8 additions & 0 deletions vulnerabilities/management/commands/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from vulnerabilities.import_runner import ImportRunner
from vulnerabilities.importers import IMPORTERS_REGISTRY
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline


class Command(BaseCommand):
Expand Down Expand Up @@ -57,6 +58,13 @@ def import_data(self, importers):

for importer in importers:
self.stdout.write(f"Importing data using {importer.qualified_name}")
if issubclass(importer, VulnerableCodeBaseImporterPipeline):
status, error = importer().execute()
if status != 0:
self.stdout.write(error)
failed_importers.append(importer.qualified_name)
continue

try:
ImportRunner(importer).run()
self.stdout.write(
Expand Down
98 changes: 97 additions & 1 deletion vulnerabilities/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,24 @@
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/vulnerablecode for support or download.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
from datetime import datetime
from datetime import timezone
from traceback import format_exc as traceback_format_exc
from typing import Iterable

from aboutcode.pipeline import BasePipeline
from aboutcode.pipeline import LoopProgress

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.improver import MAX_CONFIDENCE
from vulnerabilities.models import Advisory
from vulnerabilities.pipes.advisory import import_advisory
from vulnerabilities.pipes.advisory import insert_advisory
from vulnerabilities.utils import classproperty

module_logger = logging.getLogger(__name__)
Expand All @@ -32,3 +41,90 @@ def qualified_name(cls):
Fully qualified name prefixed with the module name of the pipeline used in logging.
"""
return f"{cls.__module__}.{cls.__qualname__}"


class VulnerableCodeBaseImporterPipeline(VulnerableCodePipeline):
    """
    Base importer pipeline for importing advisories.

    Usage:
        Subclass this pipeline and implement the ``advisories_count`` and
        ``collect_advisories`` methods.
        Also override ``steps`` and ``advisory_confidence`` as needed.
    """

    # Source metadata; every field defaults to None and is meant to be set by subclasses.
    license_url = None
    spdx_license_expression = None
    repo_url = None
    importer_name = None
    # Confidence passed to ``import_advisory`` for every advisory of this pipeline.
    advisory_confidence = MAX_CONFIDENCE

    @classmethod
    def steps(cls):
        """Return the ordered tuple of pipeline steps to run."""
        return (
            # Add step for downloading/cloning resource as required.
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            # Add step for removing downloaded/cloned resource as required.
        )

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield AdvisoryData for the importer pipeline.

        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def advisories_count(self) -> int:
        """
        Return the estimated number of AdvisoryData to be yielded by ``collect_advisories``.

        Used by ``collect_and_store_advisories`` to log the progress of advisory collection.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def collect_and_store_advisories(self):
        """
        Pass every AdvisoryData from ``collect_advisories`` to ``insert_advisory``
        and log how many calls returned a truthy result.
        """
        collected_advisory_count = 0
        progress = LoopProgress(total_iterations=self.advisories_count(), logger=self.log)
        for advisory in progress.iter(self.collect_advisories()):
            # Only a truthy return value from ``insert_advisory`` is counted as collected.
            if insert_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                logger=self.log,
            ):
                collected_advisory_count += 1

        self.log(f"Successfully collected {collected_advisory_count:,d} advisories")

    def import_new_advisories(self):
        """
        Run ``import_advisory`` on every Advisory created by this pipeline that
        has no ``date_imported`` yet, and log how many ended up imported.
        """
        new_advisories = Advisory.objects.filter(
            created_by=self.qualified_name,
            date_imported__isnull=True,
        )

        new_advisories_count = new_advisories.count()

        self.log(f"Importing {new_advisories_count:,d} new advisories")

        imported_advisory_count = 0
        progress = LoopProgress(total_iterations=new_advisories_count, logger=self.log)
        for advisory in progress.iter(new_advisories.paginated()):
            self.import_advisory(advisory=advisory)
            # A failed import is logged and swallowed by ``import_advisory``, so success
            # is detected through ``date_imported`` being set on the object afterwards.
            # NOTE(review): assumes the ``import_advisory`` pipe sets ``date_imported`` - confirm.
            if advisory.date_imported:
                imported_advisory_count += 1

        self.log(f"Successfully imported {imported_advisory_count:,d} new advisories")

    def import_advisory(self, advisory: Advisory) -> None:
        """
        Import a single ``advisory`` using this pipeline's name and confidence.

        Any exception raised by the underlying ``import_advisory`` pipe is logged
        at ERROR level together with its traceback and is not re-raised, so one
        failing advisory does not abort the whole run.
        """
        try:
            import_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                confidence=self.advisory_confidence,
                logger=self.log,
            )
        except Exception as e:
            self.log(
                f"Failed to import advisory: {advisory!r} with error {e!r}:\n{traceback_format_exc()}",
                level=logging.ERROR,
            )
70 changes: 70 additions & 0 deletions vulnerabilities/pipelines/pypa_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import logging
from pathlib import Path
from typing import Iterable

import saneyaml
from fetchcode.vcs import fetch_via_vcs

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importers.osv import parse_advisory_data
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline
from vulnerabilities.utils import get_advisory_url

module_logger = logging.getLogger(__name__)


class PyPaImporterPipeline(VulnerableCodeBaseImporterPipeline):
    """Collect advisories from PyPA GitHub repository."""

    spdx_license_expression = "CC-BY-4.0"
    license_url = "https://github.com/pypa/advisory-database/blob/main/LICENSE"
    repo_url = "git+https://github.com/pypa/advisory-database"
    importer_name = "Pypa Importer"

    @classmethod
    def steps(cls):
        """Return the ordered steps: clone, collect, import, then remove the clone."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Fetch ``repo_url`` and keep the VCS response on ``self.vcs_response``."""
        self.log(f"Cloning `{self.repo_url}`")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self) -> int:
        """Return the number of ``*.yaml`` files under the clone's ``vulns`` directory."""
        vulns_directory = Path(self.vcs_response.dest_dir) / "vulns"
        return sum(1 for _ in vulns_directory.rglob("*.yaml"))

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield the result of ``parse_advisory_data`` for every ``*.yaml`` file found
        under the ``vulns`` directory of the cloned repository.
        """
        base_directory = Path(self.vcs_response.dest_dir)
        vulns_directory = base_directory / "vulns"

        # Do not assign to ``self.advisories_count`` here: an instance attribute
        # would shadow the ``advisories_count()`` method and make it uncallable.
        for advisory in vulns_directory.rglob("*.yaml"):
            advisory_url = get_advisory_url(
                file=advisory,
                base_path=base_directory,
                url="https://github.com/pypa/advisory-database/blob/main/",
            )
            advisory_dict = saneyaml.load(advisory.read_text())
            yield parse_advisory_data(
                raw_data=advisory_dict,
                supported_ecosystems=["pypi"],
                advisory_url=advisory_url,
            )

    def clean_downloads(self):
        """Delete the cloned repository, if one was fetched."""
        # ``vcs_response`` only exists once ``clone`` has run; tolerate its absence
        # so that cleanup never fails with an AttributeError.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            self.log("Removing cloned repository")
            vcs_response.delete()
Loading
Loading
0