From bcd01d0df263d3f5d590ed5b50d9b19a3ec6e026 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Fri, 6 Sep 2024 14:26:05 +0530 Subject: [PATCH 1/4] Migrate GitHub importer to aboutcode pipeline Signed-off-by: Keshav Priyadarshi --- vulnerabilities/importers/__init__.py | 4 +- vulnerabilities/improvers/valid_versions.py | 4 +- vulnerabilities/pipelines/__init__.py | 7 +- .../github_importer.py} | 225 +++++++++++------- .../test_github_importer_pipeline.py} | 61 +++-- vulnerabilities/tests/test_upstream.py | 2 +- 6 files changed, 192 insertions(+), 111 deletions(-) rename vulnerabilities/{importers/github.py => pipelines/github_importer.py} (56%) rename vulnerabilities/tests/{test_github.py => pipelines/test_github_importer_pipeline.py} (84%) diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 75d9e8bed..a69fe1629 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -19,7 +19,6 @@ from vulnerabilities.importers import epss from vulnerabilities.importers import fireeye from vulnerabilities.importers import gentoo -from vulnerabilities.importers import github from vulnerabilities.importers import github_osv from vulnerabilities.importers import istio from vulnerabilities.importers import mozilla @@ -38,6 +37,7 @@ from vulnerabilities.importers import vulnrichment from vulnerabilities.importers import xen from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline +from vulnerabilities.pipelines import github_importer from vulnerabilities.pipelines import gitlab_importer from vulnerabilities.pipelines import nginx_importer from vulnerabilities.pipelines import npm_importer @@ -45,7 +45,6 @@ IMPORTERS_REGISTRY = [ nvd.NVDImporter, - github.GitHubAPIImporter, pysec.PyPIImporter, alpine_linux.AlpineImporter, openssl.OpensslImporter, @@ -78,6 +77,7 @@ npm_importer.NpmImporterPipeline, nginx_importer.NginxImporterPipeline, gitlab_importer.GitLabImporterPipeline, + 
github_importer.GitHubAPIImporterPipeline, ] IMPORTERS_REGISTRY = { diff --git a/vulnerabilities/improvers/valid_versions.py b/vulnerabilities/improvers/valid_versions.py index e65b619ad..5d1e087ec 100644 --- a/vulnerabilities/improvers/valid_versions.py +++ b/vulnerabilities/improvers/valid_versions.py @@ -31,7 +31,6 @@ from vulnerabilities.importers.debian import DebianImporter from vulnerabilities.importers.debian_oval import DebianOvalImporter from vulnerabilities.importers.elixir_security import ElixirSecurityImporter -from vulnerabilities.importers.github import GitHubAPIImporter from vulnerabilities.importers.github_osv import GithubOSVImporter from vulnerabilities.importers.istio import IstioImporter from vulnerabilities.importers.oss_fuzz import OSSFuzzImporter @@ -42,6 +41,7 @@ from vulnerabilities.improver import Inference from vulnerabilities.models import Advisory from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline +from vulnerabilities.pipelines.github_importer import GitHubAPIImporterPipeline from vulnerabilities.pipelines.gitlab_importer import GitLabImporterPipeline from vulnerabilities.pipelines.nginx_importer import NginxImporterPipeline from vulnerabilities.pipelines.npm_importer import NpmImporterPipeline @@ -371,7 +371,7 @@ class GitLabBasicImprover(ValidVersionImprover): class GitHubBasicImprover(ValidVersionImprover): - importer = GitHubAPIImporter + importer = GitHubAPIImporterPipeline ignorable_versions = frozenset( [ "0.1-bulbasaur", diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py index aa3d59d83..0d3589b67 100644 --- a/vulnerabilities/pipelines/__init__.py +++ b/vulnerabilities/pipelines/__init__.py @@ -89,7 +89,12 @@ def advisories_count(self) -> int: def collect_and_store_advisories(self): collected_advisory_count = 0 - progress = LoopProgress(total_iterations=self.advisories_count(), logger=self.log) + estimated_advisory_count = self.advisories_count() + + if 
estimated_advisory_count > 0: + self.log(f"Collecting {estimated_advisory_count:,d} advisories") + + progress = LoopProgress(total_iterations=estimated_advisory_count, logger=self.log) for advisory in progress.iter(self.collect_advisories()): if _obj := insert_advisory( advisory=advisory, diff --git a/vulnerabilities/importers/github.py b/vulnerabilities/pipelines/github_importer.py similarity index 56% rename from vulnerabilities/importers/github.py rename to vulnerabilities/pipelines/github_importer.py index c12c43044..d5df390b4 100644 --- a/vulnerabilities/importers/github.py +++ b/vulnerabilities/pipelines/github_importer.py @@ -8,9 +8,14 @@ # import logging +from traceback import format_exc as traceback_format_exc +from typing import Callable from typing import Iterable +from typing import List from typing import Optional +import requests +from bs4 import BeautifulSoup from cwe2.database import Database from dateutil import parser as dateparser from packageurl import PackageURL @@ -21,85 +26,120 @@ from vulnerabilities import utils from vulnerabilities.importer import AdvisoryData from vulnerabilities.importer import AffectedPackage -from vulnerabilities.importer import Importer from vulnerabilities.importer import Reference from vulnerabilities.importer import VulnerabilitySeverity +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline from vulnerabilities.utils import dedupe from vulnerabilities.utils import get_cwe_id from vulnerabilities.utils import get_item -logger = logging.getLogger(__name__) - -PACKAGE_TYPE_BY_GITHUB_ECOSYSTEM = { - "MAVEN": "maven", - "NUGET": "nuget", - "COMPOSER": "composer", - "PIP": "pypi", - "RUBYGEMS": "gem", - "NPM": "npm", - "RUST": "cargo", - # "GO": "golang", -} - -GITHUB_ECOSYSTEM_BY_PACKAGE_TYPE = { - value: key for (key, value) in PACKAGE_TYPE_BY_GITHUB_ECOSYSTEM.items() -} - -# TODO: We will try to gather more info from GH API -# Check 
https://github.com/nexB/vulnerablecode/issues/1039#issuecomment-1366458885 -# Check https://github.com/nexB/vulnerablecode/issues/645 -# set of all possible values of first '%s' = {'MAVEN','COMPOSER', 'NUGET', 'RUBYGEMS', 'PYPI', 'NPM', 'RUST'} -# second '%s' is interesting, it will have the value '' for the first request, -GRAPHQL_QUERY_TEMPLATE = """ -query{ - securityVulnerabilities(first: 100, ecosystem: %s, %s) { - edges { - node { - advisory { - identifiers { - type - value - } - summary - references { - url - } - severity - cwes(first: 10){ - nodes { - cweId + +class GitHubAPIImporterPipeline(VulnerableCodeBaseImporterPipeline): + """Collect GitHub advisories.""" + + spdx_license_expression = "CC-BY-4.0" + license_url = "https://github.com/github/advisory-database/blob/main/LICENSE.md" + importer_name = "GHSA Importer" + + @classmethod + def steps(cls): + return ( + cls.collect_and_store_advisories, + cls.import_new_advisories, + ) + + package_type_by_github_ecosystem = { + "MAVEN": "maven", + "NUGET": "nuget", + "COMPOSER": "composer", + "PIP": "pypi", + "RUBYGEMS": "gem", + "NPM": "npm", + "RUST": "cargo", + # "GO": "golang", + } + + github_ecosystem_by_package_type = { + value: key for (key, value) in package_type_by_github_ecosystem.items() + } + + def advisories_count(self): + normalized_github_ecosystems = [ + k.lower() for k in self.package_type_by_github_ecosystem.keys() + ] + + try: + response = requests.get("https://github.com/advisories") + response.raise_for_status() + except requests.HTTPError as http_err: + self.log( + f"HTTP error occurred: {http_err} \n {traceback_format_exc()}", + level=logging.ERROR, + ) + return 0 + + soup = BeautifulSoup(response.text, "html.parser") + advisory_counts = 0 + for li in soup.select("ul.filter-list li") or []: + if link := li.find("a", class_="filter-item"): + ecosystem, _, _ = link.text.strip().rpartition(" ") + if count_span := li.find("span", class_="count"): + count = 
int(count_span.text.strip().replace(",", "")) + ecosystem = ecosystem.strip().lower() + if ecosystem in normalized_github_ecosystems: + advisory_counts += count + + return advisory_counts + + def collect_advisories(self) -> Iterable[AdvisoryData]: + + # TODO: We will try to gather more info from GH API + # Check https://github.com/nexB/vulnerablecode/issues/1039#issuecomment-1366458885 + # Check https://github.com/nexB/vulnerablecode/issues/645 + # set of all possible values of first '%s' = {'MAVEN','COMPOSER', 'NUGET', 'RUBYGEMS', 'PIP', 'NPM', 'RUST'} + # second '%s' is filled from `end_cursor_exp`, which is '' for the first request of each ecosystem. + advisory_query = """ + query{ + securityVulnerabilities(first: 100, ecosystem: %s, %s) { + edges { + node { + advisory { + identifiers { + type + value + } + summary + references { + url + } + severity + cwes(first: 10){ + nodes { + cweId + } + } + publishedAt } + firstPatchedVersion{ + identifier + } + package { + name + } + vulnerableVersionRange } - publishedAt } - firstPatchedVersion{ - identifier + pageInfo { + hasNextPage + endCursor } - package { - name - } - vulnerableVersionRange } } - pageInfo { - hasNextPage - endCursor - } - } -} -""" - - -class GitHubAPIImporter(Importer): - spdx_license_expression = "CC-BY-4.0" - importer_name = "GHSA Importer" - license_url = "https://github.com/github/advisory-database/blob/main/LICENSE.md" - - def advisory_data(self) -> Iterable[AdvisoryData]: - for ecosystem, package_type in PACKAGE_TYPE_BY_GITHUB_ECOSYSTEM.items(): + """ + for ecosystem, package_type in self.package_type_by_github_ecosystem.items(): end_cursor_exp = "" while True: - graphql_query = {"query": GRAPHQL_QUERY_TEMPLATE % (ecosystem, end_cursor_exp)} + graphql_query = {"query": advisory_query % (ecosystem, end_cursor_exp)} response = utils.fetch_github_graphql_query(graphql_query) page_info = get_item(response, "data", "securityVulnerabilities", "pageInfo") @@ -114,7 +154,7 @@ def advisory_data(self) -> 
Iterable[AdvisoryData]: break -def get_purl(pkg_type: str, github_name: str) -> Optional[PackageURL]: +def get_purl(pkg_type: str, github_name: str, logger: Callable = None) -> Optional[PackageURL]: """ Return a PackageURL by splitting the `github_name` using the `pkg_type` convention. Return None and log an error if we can not split or it is an @@ -129,7 +169,8 @@ def get_purl(pkg_type: str, github_name: str) -> Optional[PackageURL]: """ if pkg_type == "maven": if ":" not in github_name: - logger.error(f"get_purl: Invalid maven package name {github_name}") + if logger: + logger(f"get_purl: Invalid maven package name {github_name}", level=logging.ERROR) return ns, _, name = github_name.partition(":") return PackageURL(type=pkg_type, namespace=ns, name=name) @@ -143,18 +184,23 @@ def get_purl(pkg_type: str, github_name: str) -> Optional[PackageURL]: if pkg_type in ("nuget", "pypi", "gem", "golang", "npm", "cargo"): return PackageURL(type=pkg_type, name=github_name) - logger.error(f"get_purl: Unknown package type {pkg_type}") + if logger: + logger(f"get_purl: Unknown package type {pkg_type}", level=logging.ERROR) -def process_response(resp: dict, package_type: str) -> Iterable[AdvisoryData]: +def process_response( + resp: dict, package_type: str, logger: Callable = None +) -> Iterable[AdvisoryData]: """ Yield `AdvisoryData` by taking `resp` and `ecosystem` as input """ vulnerabilities = get_item(resp, "data", "securityVulnerabilities", "edges") or [] if not vulnerabilities: - logger.error( - f"No vulnerabilities found for package_type: {package_type!r} in response: {resp!r}" - ) + if logger: + logger( + f"No vulnerabilities found for package_type: {package_type!r} in response: {resp!r}", + level=logging.ERROR, + ) return for vulnerability in vulnerabilities: @@ -162,12 +208,14 @@ def process_response(resp: dict, package_type: str) -> Iterable[AdvisoryData]: affected_packages = [] github_advisory = get_item(vulnerability, "node") if not github_advisory: - 
logger.error(f"No node found in {vulnerability!r}") + if logger: + logger(f"No node found in {vulnerability!r}", level=logging.ERROR) continue advisory = get_item(github_advisory, "advisory") if not advisory: - logger.error(f"No advisory found in {github_advisory!r}") + if logger: + logger(f"No advisory found in {github_advisory!r}", level=logging.ERROR) continue summary = get_item(advisory, "summary") or "" @@ -183,7 +231,7 @@ def process_response(resp: dict, package_type: str) -> Iterable[AdvisoryData]: name = get_item(github_advisory, "package", "name") if name: - purl = get_purl(pkg_type=package_type, github_name=name) + purl = get_purl(pkg_type=package_type, github_name=name, logger=logger) if purl: affected_range = get_item(github_advisory, "vulnerableVersionRange") fixed_version = get_item(github_advisory, "firstPatchedVersion", "identifier") @@ -193,7 +241,11 @@ def process_response(resp: dict, package_type: str) -> Iterable[AdvisoryData]: package_type, affected_range ) except Exception as e: - logger.error(f"Could not parse affected range {affected_range!r} {e!r}") + if logger: + logger( + f"Could not parse affected range {affected_range!r} {e!r} \n {traceback_format_exc()}", + level=logging.ERROR, + ) affected_range = None if fixed_version: try: @@ -201,7 +253,11 @@ def process_response(resp: dict, package_type: str) -> Iterable[AdvisoryData]: fixed_version ) except Exception as e: - logger.error(f"Invalid fixed version {fixed_version!r} {e!r}") + if logger: + logger( + f"Invalid fixed version {fixed_version!r} {e!r} \n {traceback_format_exc()}", + level=logging.ERROR, + ) fixed_version = None if affected_range or fixed_version: affected_packages.append( @@ -236,9 +292,13 @@ def process_response(resp: dict, package_type: str) -> Iterable[AdvisoryData]: elif identifier_type == "CVE": pass else: - logger.error(f"Unknown identifier type {identifier_type!r} and value {value!r}") + if logger: + logger( + f"Unknown identifier type {identifier_type!r} and value 
{value!r}", + level=logging.ERROR, + ) - weaknesses = get_cwes_from_github_advisory(advisory) + weaknesses = get_cwes_from_github_advisory(advisory, logger) yield AdvisoryData( aliases=sorted(dedupe(aliases)), @@ -251,7 +311,7 @@ def process_response(resp: dict, package_type: str) -> Iterable[AdvisoryData]: ) -def get_cwes_from_github_advisory(advisory) -> [int]: +def get_cwes_from_github_advisory(advisory, logger=None) -> List[int]: """ Return the cwe-id list from advisory ex: [ 522 ] by extracting the cwe_list from advisory ex: [{'cweId': 'CWE-522'}] @@ -267,6 +327,7 @@ def get_cwes_from_github_advisory(advisory) -> [int]: try: db.get(cwe_id) weaknesses.append(cwe_id) - except Exception: - logger.error("Invalid CWE id") + except Exception as e: + if logger: + logger(f"Invalid CWE id {e!r} \n {traceback_format_exc()}", level=logging.ERROR) return weaknesses diff --git a/vulnerabilities/tests/test_github.py b/vulnerabilities/tests/pipelines/test_github_importer_pipeline.py similarity index 84% rename from vulnerabilities/tests/test_github.py rename to vulnerabilities/tests/pipelines/test_github_importer_pipeline.py index 2b5593137..d46e3ef19 100644 --- a/vulnerabilities/tests/test_github.py +++ b/vulnerabilities/tests/pipelines/test_github_importer_pipeline.py @@ -10,6 +10,7 @@ import json import os from datetime import datetime +from pathlib import Path from unittest import mock import pytest @@ -23,23 +24,22 @@ from vulnerabilities.importer import AffectedPackage from vulnerabilities.importer import Reference from vulnerabilities.importer import VulnerabilitySeverity -from vulnerabilities.importers.github import GitHubAPIImporter -from vulnerabilities.importers.github import get_cwes_from_github_advisory -from vulnerabilities.importers.github import process_response from vulnerabilities.improvers.valid_versions import GitHubBasicImprover +from vulnerabilities.pipelines.github_importer import GitHubAPIImporterPipeline +from 
vulnerabilities.pipelines.github_importer import get_cwes_from_github_advisory +from vulnerabilities.pipelines.github_importer import process_response +from vulnerabilities.tests.pipelines import TestLogger from vulnerabilities.tests.util_tests import VULNERABLECODE_REGEN_TEST_FIXTURES as REGEN -from vulnerabilities.utils import GitHubTokenError -BASE_DIR = os.path.dirname(os.path.abspath(__file__)) -TEST_DATA = os.path.join(BASE_DIR, "test_data", "github_api") +TEST_DATA = Path(__file__).parent.parent / "test_data" / "github_api" @pytest.mark.parametrize( "pkg_type", ["maven", "nuget", "gem", "golang", "composer", "pypi", "npm", "cargo"] ) def test_process_response_github_importer(pkg_type, regen=REGEN): - response_file = os.path.join(TEST_DATA, f"{pkg_type}.json") - expected_file = os.path.join(TEST_DATA, f"{pkg_type}-expected.json") + response_file = TEST_DATA / f"{pkg_type}.json" + expected_file = TEST_DATA / f"{pkg_type}-expected.json" with open(response_file) as f: response = json.load(f) @@ -56,34 +56,49 @@ def test_process_response_github_importer(pkg_type, regen=REGEN): assert result == expected -def test_process_response_with_empty_vulnaribilities(caplog): - list(process_response({"data": {"securityVulnerabilities": {"edges": []}}}, "maven")) - assert "No vulnerabilities found for package_type: 'maven'" in caplog.text +def test_process_response_with_empty_vulnaribilities(): + logger = TestLogger() + list( + process_response( + {"data": {"securityVulnerabilities": {"edges": []}}}, + "maven", + logger=logger.write, + ) + ) + assert "No vulnerabilities found for package_type: 'maven'" in logger.getvalue() -def test_process_response_with_empty_vulnaribilities_2(caplog): +def test_process_response_with_empty_vulnaribilities_2(): + logger = TestLogger() list( process_response( - {"data": {"securityVulnerabilities": {"edges": [{"node": {}}, None]}}}, "maven" + {"data": {"securityVulnerabilities": {"edges": [{"node": {}}, None]}}}, + "maven", + 
logger=logger.write, ) ) - assert "No node found" in caplog.text + assert "No node found" in logger.getvalue() def test_github_importer_with_missing_credentials(): - with pytest.raises(GitHubTokenError) as e: - with mock.patch.dict(os.environ, {}, clear=True): - importer = GitHubAPIImporter() - list(importer.advisory_data()) + with mock.patch.dict(os.environ, {}, clear=True): + github_pipeline = GitHubAPIImporterPipeline() + status, error = github_pipeline.execute() + assert 1 == status + assert ( + "Cannot call GitHub API without a token set in the GH_TOKEN environment variable." + in error + ) @mock.patch("vulnerabilities.utils._get_gh_response") def test_github_importer_with_missing_credentials_2(mock_response): mock_response.return_value = {"message": "Bad credentials"} - with pytest.raises(GitHubTokenError) as e: - with mock.patch.dict(os.environ, {"GH_TOKEN": "FOOD"}, clear=True): - importer = GitHubAPIImporter() - list(importer.advisory_data()) + with mock.patch.dict(os.environ, {"GH_TOKEN": "FOOD"}, clear=True): + github_pipeline = GitHubAPIImporterPipeline() + status, error = github_pipeline.execute() + assert 1 == status + assert "Invalid GitHub token: Bad credentials" in error def valid_versions(): @@ -283,7 +298,7 @@ def test_github_improver(mock_response, regen=REGEN): @mock.patch("fetchcode.package_versions.get_response") def test_get_package_versions(mock_response): - with open(os.path.join(BASE_DIR, "test_data", "package_manager_data", "pypi.json"), "r") as f: + with open(TEST_DATA.parent / "package_manager_data" / "pypi.json", "r") as f: mock_response.return_value = json.load(f) improver = GitHubBasicImprover() diff --git a/vulnerabilities/tests/test_upstream.py b/vulnerabilities/tests/test_upstream.py index 925d28d80..ad5f50113 100644 --- a/vulnerabilities/tests/test_upstream.py +++ b/vulnerabilities/tests/test_upstream.py @@ -19,7 +19,7 @@ ) def test_updated_advisories(importer_name, importer_class): # FIXME: why are we doing this? 
- if importer_name.endswith("GitHubAPIImporter"): + if importer_name.endswith("GitHubAPIImporterPipeline"): return advisory_datas = importer_class().advisory_data() From 5386adcbf9f0c0e7f8ca4a229558e0613d6ff4fa Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Fri, 27 Sep 2024 20:01:32 +0530 Subject: [PATCH 2/4] Add pipeline_id to github pipeline Signed-off-by: Keshav Priyadarshi --- vulnerabilities/pipelines/github_importer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vulnerabilities/pipelines/github_importer.py b/vulnerabilities/pipelines/github_importer.py index d5df390b4..748674a58 100644 --- a/vulnerabilities/pipelines/github_importer.py +++ b/vulnerabilities/pipelines/github_importer.py @@ -37,6 +37,8 @@ class GitHubAPIImporterPipeline(VulnerableCodeBaseImporterPipeline): """Collect GitHub advisories.""" + pipeline_id = "github_importer" + spdx_license_expression = "CC-BY-4.0" license_url = "https://github.com/github/advisory-database/blob/main/LICENSE.md" importer_name = "GHSA Importer" From b6651a443a21e5d079b9736d3cd39259e075e220 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Fri, 27 Sep 2024 20:19:04 +0530 Subject: [PATCH 3/4] Add data migration for github advisory Signed-off-by: Keshav Priyadarshi --- .../0067_update_github_advisory_created_by.py | 38 +++++++++++++++++ vulnerabilities/tests/test_data_migrations.py | 41 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 vulnerabilities/migrations/0067_update_github_advisory_created_by.py diff --git a/vulnerabilities/migrations/0067_update_github_advisory_created_by.py b/vulnerabilities/migrations/0067_update_github_advisory_created_by.py new file mode 100644 index 000000000..4b9bb8485 --- /dev/null +++ b/vulnerabilities/migrations/0067_update_github_advisory_created_by.py @@ -0,0 +1,38 @@ +# Generated by Django 4.2.15 on 2024-09-27 14:31 + +from django.db import migrations + +""" +Update the created_by field on Advisory from the old qualified_name +to the new 
pipeline_id. +""" + + +def update_created_by(apps, schema_editor): + from vulnerabilities.pipelines.github_importer import GitHubAPIImporterPipeline + + Advisory = apps.get_model("vulnerabilities", "Advisory") + Advisory.objects.filter(created_by="vulnerabilities.importers.github.GitHubAPIImporter").update( + created_by=GitHubAPIImporterPipeline.pipeline_id + ) + + + +def reverse_update_created_by(apps, schema_editor): + from vulnerabilities.pipelines.github_importer import GitHubAPIImporterPipeline + + Advisory = apps.get_model("vulnerabilities", "Advisory") + Advisory.objects.filter(created_by=GitHubAPIImporterPipeline.pipeline_id).update( + created_by="vulnerabilities.importers.github.GitHubAPIImporter" + ) + + +class Migration(migrations.Migration): + + dependencies = [ + ("vulnerabilities", "0066_update_gitlab_advisory_created_by"), + ] + + operations = [ + migrations.RunPython(update_created_by, reverse_code=reverse_update_created_by), + ] diff --git a/vulnerabilities/tests/test_data_migrations.py b/vulnerabilities/tests/test_data_migrations.py index 625e587ff..fcad0a1d4 100644 --- a/vulnerabilities/tests/test_data_migrations.py +++ b/vulnerabilities/tests/test_data_migrations.py @@ -761,3 +761,44 @@ def test_removal_of_duped_purls(self): adv.filter(created_by="vulnerabilities.importers.gitlab.GitLabAPIImporter").count() == 0 ) assert adv.filter(created_by="gitlab_importer").count() == 1 + + +class TestUpdateGitHubAdvisoryCreatedByField(TestMigrations): + app_name = "vulnerabilities" + migrate_from = "0066_update_gitlab_advisory_created_by" + migrate_to = "0067_update_github_advisory_created_by" + + advisory_data1 = AdvisoryData( + aliases=["CVE-2020-13371337"], + summary="vulnerability description here", + affected_packages=[ + AffectedPackage( + package=PackageURL(type="pypi", name="foobar"), + affected_version_range=VersionRange.from_string("vers:pypi/>=1.0.0|<=2.0.0"), + ) + ], + 
references=[Reference(url="https://example.com/with/more/info/CVE-2020-13371337")], + date_published=timezone.now(), + url="https://test.com", + ) + + def setUpBeforeMigration(self, apps): + Advisory = apps.get_model("vulnerabilities", "Advisory") + adv1 = Advisory.objects.create( + aliases=self.advisory_data1.aliases, + summary=self.advisory_data1.summary, + affected_packages=[pkg.to_dict() for pkg in self.advisory_data1.affected_packages], + references=[ref.to_dict() for ref in self.advisory_data1.references], + url=self.advisory_data1.url, + created_by="vulnerabilities.importers.github.GitHubAPIImporter", + date_collected=timezone.now(), + ) + + def test_removal_of_duped_purls(self): + Advisory = apps.get_model("vulnerabilities", "Advisory") + adv = Advisory.objects.all() + + assert ( + adv.filter(created_by="vulnerabilities.importers.github.GitHubAPIImporter").count() == 0 + ) + assert adv.filter(created_by="github_importer").count() == 1 From 1d3da91680ee68cb82c31690e1e8191515d521fd Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Fri, 27 Sep 2024 22:06:31 +0530 Subject: [PATCH 4/4] Use GraphQL to get the advisories_count Signed-off-by: Keshav Priyadarshi --- vulnerabilities/pipelines/github_importer.py | 41 ++++++-------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/vulnerabilities/pipelines/github_importer.py b/vulnerabilities/pipelines/github_importer.py index 748674a58..4603b939a 100644 --- a/vulnerabilities/pipelines/github_importer.py +++ b/vulnerabilities/pipelines/github_importer.py @@ -14,8 +14,6 @@ from typing import List from typing import Optional -import requests -from bs4 import BeautifulSoup from cwe2.database import Database from dateutil import parser as dateparser from packageurl import PackageURL @@ -61,36 +59,19 @@ def steps(cls): # "GO": "golang", } - github_ecosystem_by_package_type = { - value: key for (key, value) in package_type_by_github_ecosystem.items() - } - def advisories_count(self): - 
normalized_github_ecosystems = [ - k.lower() for k in self.package_type_by_github_ecosystem.keys() - ] - - try: - response = requests.get("https://github.com/advisories") - response.raise_for_status() - except requests.HTTPError as http_err: - self.log( - f"HTTP error occurred: {http_err} \n {traceback_format_exc()}", - level=logging.ERROR, - ) - return 0 - - soup = BeautifulSoup(response.text, "html.parser") + advisory_query = """ + query{ + securityVulnerabilities(first: 0, ecosystem: %s) { + totalCount + } + } + """ advisory_counts = 0 - for li in soup.select("ul.filter-list li") or []: - if link := li.find("a", class_="filter-item"): - ecosystem, _, _ = link.text.strip().rpartition(" ") - if count_span := li.find("span", class_="count"): - count = int(count_span.text.strip().replace(",", "")) - ecosystem = ecosystem.strip().lower() - if ecosystem in normalized_github_ecosystems: - advisory_counts += count - + for ecosystem in self.package_type_by_github_ecosystem.keys(): + graphql_query = {"query": advisory_query % (ecosystem)} + response = utils.fetch_github_graphql_query(graphql_query) + advisory_counts += get_item(response, "data", "securityVulnerabilities", "totalCount") return advisory_counts def collect_advisories(self) -> Iterable[AdvisoryData]: