
Ensure deps.dev data is fresh for each run of Criticality Score #267


Merged: calebbrown merged 6 commits on Dec 1, 2022 (ossf/criticality_score)
Changes from all commits
13 changes: 13 additions & 0 deletions cmd/collect_signals/main.go
@@ -20,6 +20,7 @@ import (
"net/http"
"net/rpc"
"os"
"strconv"
"strings"
"time"

@@ -108,6 +109,17 @@ func main() {
gcpDatasetName = collector.DefaultGCPDatasetName
}

// Extract the GCP dataset TTL.
gcpDatasetTTLHours := criticalityConfig["dataset-ttl-hours"]
gcpDatasetTTL := time.Duration(0)
if gcpDatasetTTLHours != "" {
i, err := strconv.Atoi(gcpDatasetTTLHours)
if err != nil {
logger.With(zap.Error(err)).Fatal("Failed to get GCP Dataset TTL")
}
gcpDatasetTTL = time.Hour * time.Duration(i)
}

// Determine whether scoring is enabled or disabled.
// It supports various "truthy" and "falsey" values. It will default to
// enabled.
@@ -146,6 +158,7 @@ func main() {
collector.EnableAllSources(),
collector.GCPProject(gcpProjectID),
collector.GCPDatasetName(gcpDatasetName),
collector.GCPDatasetTTL(gcpDatasetTTL),
}

w, err := NewWorker(context.Background(), logger, scoringEnabled, scoringConfigFile, scoringColumnName, csvBucketURL, opts)
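With the prod setting later in this PR (`dataset-ttl-hours: 672`), the conversion above yields a 672-hour, i.e. four-week, default table expiration.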
3 changes: 2 additions & 1 deletion cmd/collect_signals/worker.go
@@ -39,6 +39,7 @@ type collectWorker struct {
func (w *collectWorker) Process(ctx context.Context, req *data.ScorecardBatchRequest, bucketURL string) error {
filename := worker.ResultFilename(req)
jobTime := req.GetJobTime().AsTime()
jobID := jobTime.Format("20060102_150405")

// Prepare the logger with identifiers for the shard and job.
logger := w.logger.With(
@@ -83,7 +84,7 @@ func (w *collectWorker) Process(ctx context.Context, req *data.ScorecardBatchReq
repoLogger.With(zap.Error(err)).Warn("Failed to parse repo URL")
continue
}
- ss, err := w.c.Collect(ctx, u)
+ ss, err := w.c.Collect(ctx, u, jobID)
if err != nil {
if errors.Is(err, collector.ErrUncollectableRepo) {
repoLogger.With(zap.Error(err)).Warn("Repo is uncollectable")
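The `jobID` above uses Go's reference-time layout: `20060102_150405` is a layout string spelled with the reference instant (Mon Jan 2 15:04:05 2006), not a literal date. A small standalone sketch with a made-up timestamp:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Layout strings are written in terms of Go's reference time,
	// Mon Jan 2 15:04:05 MST 2006; Format substitutes the real values.
	jobTime := time.Date(2022, time.December, 1, 15, 4, 5, 0, time.UTC)
	fmt.Println(jobTime.Format("20060102_150405")) // Output: 20221201_150405
}
```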
7 changes: 6 additions & 1 deletion cmd/criticality_score/README.md
@@ -126,7 +126,12 @@ fail.
#### deps.dev Collection Flags

- `-depsdev-disable` disables the collection of signals from deps.dev.
-- `-depsdev-dataset string` the BigQuery dataset name to use. Default is `depsdev_analysis`.
+- `-depsdev-dataset string` the BigQuery dataset name to use. Default is
+  `depsdev_analysis`.
- `-depsdev-expiration hours` the default time-to-live or expiration for tables
created in the BigQuery dataset. New tables will be deleted after this
period. Expiration times on existing tables in the dataset won't be changed.
Default is `0` (no expiration).

#### Scoring flags

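As a worked example of the new flag: `-depsdev-expiration 168` would have tables created during a run deleted 168 hours (one week) after creation, matching the staging configuration later in this PR.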
5 changes: 4 additions & 1 deletion cmd/criticality_score/main.go
@@ -24,6 +24,7 @@ import (
"os"
"path"
"strings"
"time"

"go.uber.org/zap"
"go.uber.org/zap/zapcore"
@@ -42,6 +43,7 @@ var (
gcpProjectFlag = flag.String("gcp-project-id", "", "the Google Cloud Project ID to use. Auto-detects by default.")
depsdevDisableFlag = flag.Bool("depsdev-disable", false, "disables the collection of signals from deps.dev.")
depsdevDatasetFlag = flag.String("depsdev-dataset", collector.DefaultGCPDatasetName, "the BigQuery dataset name to use.")
depsdevTTLFlag = flag.Int("depsdev-expiration", 0, "the default expiration (`hours`) to use for deps.dev tables. No expiration by default.")
scoringDisableFlag = flag.Bool("scoring-disable", false, "disables the generation of scores.")
scoringConfigFlag = flag.String("scoring-config", "", "path to a YAML file for configuring the scoring algorithm.")
scoringColumnNameFlag = flag.String("scoring-column", "", "manually specify the name for the column used to hold the score.")
@@ -145,6 +147,7 @@ func main() {
collector.EnableAllSources(),
collector.GCPProject(*gcpProjectFlag),
collector.GCPDatasetName(*depsdevDatasetFlag),
collector.GCPDatasetTTL(time.Hour * time.Duration(*depsdevTTLFlag)),
}
if *depsdevDisableFlag {
opts = append(opts, collector.DisableSource(collector.SourceTypeDepsDev))
@@ -191,7 +194,7 @@ func main() {
innerLogger := logger.With(zap.Int("worker", worker))
for u := range repos {
l := innerLogger.With(zap.String("url", u.String()))
- ss, err := c.Collect(ctx, u)
+ ss, err := c.Collect(ctx, u, "")
if err != nil {
if errors.Is(err, collector.ErrUncollectableRepo) {
l.With(
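Note the empty `jobID` in the `Collect` call above: per the doc comment added to `internal/collector/collector.go` below, the empty string is the intended simple usage, presumably because a one-shot CLI run has no batch of jobs that need to share cached deps.dev tables.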
1 change: 1 addition & 0 deletions infra/envs/prod/config.yaml
@@ -33,6 +33,7 @@ additional-params:
log-env: gcp
log-level: info
dataset: ossf_criticality_score_depsdev
dataset-ttl-hours: 672 # 4 weeks
scoring: enabled
scoring-config: config/scorer/pike_depsdev.yml
scoring-column-name: default_score
1 change: 1 addition & 0 deletions infra/envs/staging/config.yaml
@@ -33,6 +33,7 @@ additional-params:
log-env: gcp
log-level: info
dataset: ossf_criticality_score_depsdev_staging
dataset-ttl-hours: 168 # 1 week
scoring: enabled
scoring-config: config/scorer/pike_depsdev.yml
scoring-column-name: default_score
9 changes: 6 additions & 3 deletions internal/collector/collector.go
@@ -81,7 +81,7 @@ func New(ctx context.Context, logger *zap.Logger, opts ...Option) (*Collector, e
// deps.dev collection source has been disabled, so skip it.
logger.Warn("deps.dev signal source is disabled.")
} else {
- ddsource, err := depsdev.NewSource(ctx, logger, c.config.gcpProject, c.config.gcpDatasetName)
+ ddsource, err := depsdev.NewSource(ctx, logger, c.config.gcpProject, c.config.gcpDatasetName, c.config.gcpDatasetTTL)
if err != nil {
return nil, fmt.Errorf("init deps.dev source: %w", err)
}
@@ -99,7 +99,10 @@ func (c *Collector) EmptySets() []signal.Set {
}

// Collect gathers and returns all the signals for the given project repo url.
-func (c *Collector) Collect(ctx context.Context, u *url.URL) ([]signal.Set, error) {
+//
+// An optional jobID can be specified which can be used by underlying sources to
+// manage caching. For simple usage this can be the empty string.
+func (c *Collector) Collect(ctx context.Context, u *url.URL, jobID string) ([]signal.Set, error) {
l := c.config.logger.With(zap.String("url", u.String()))

repo, err := c.resolver.Resolve(ctx, u)
@@ -116,7 +119,7 @@ func (c *Collector) Collect(ctx context.Context, u *url.URL) ([]signal.Set, erro
l = l.With(zap.String("canonical_url", repo.URL().String()))

l.Info("Collecting")
- ss, err := c.registry.Collect(ctx, repo)
+ ss, err := c.registry.Collect(ctx, repo, jobID)
if err != nil {
return nil, fmt.Errorf("collecting project: %w", err)
}
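A minimal sketch of how a caller wires the new pieces together, using only names introduced or shown in this PR; the project ID and repo URL are placeholders, and since `internal/...` packages are only importable from inside the repo, this is illustrative rather than a drop-in program:

```go
package main

import (
	"context"
	"log"
	"net/url"
	"time"

	"go.uber.org/zap"

	"github.com/ossf/criticality_score/internal/collector"
)

func main() {
	ctx := context.Background()
	logger, _ := zap.NewDevelopment()

	c, err := collector.New(ctx, logger,
		collector.EnableAllSources(),
		collector.GCPProject("my-gcp-project"),       // placeholder project ID
		collector.GCPDatasetName("depsdev_analysis"), // the documented default
		collector.GCPDatasetTTL(672*time.Hour),       // matches the prod config
	)
	if err != nil {
		log.Fatal(err)
	}

	// One jobID per batch run lets the deps.dev source reuse its BigQuery
	// tables across repos in the same job; "" disables that, as in the CLI.
	jobID := time.Now().UTC().Format("20060102_150405")

	u, _ := url.Parse("https://github.com/ossf/criticality_score")
	ss, err := c.Collect(ctx, u, jobID)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("collected %d signal sets", len(ss))
}
```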
11 changes: 11 additions & 0 deletions internal/collector/config.go
@@ -18,6 +18,7 @@ import (
"context"
"fmt"
"net/http"
"time"

"github.com/go-logr/zapr"
"github.com/ossf/scorecard/v4/clients/githubrepo/roundtripper"
@@ -72,6 +73,7 @@ type config struct {

gcpProject string
gcpDatasetName string
gcpDatasetTTL time.Duration

sourceStatuses map[SourceType]sourceStatus
defaultSourceStatus sourceStatus
@@ -94,6 +96,7 @@ func makeConfig(ctx context.Context, logger *zap.Logger, opts ...Option) *config
gitHubHTTPClient: defaultGitHubHTTPClient(ctx, logger),
gcpProject: "",
gcpDatasetName: DefaultGCPDatasetName,
gcpDatasetTTL: time.Duration(0),
}

for _, opt := range opts {
@@ -175,3 +178,11 @@ func GCPDatasetName(n string) Option {
c.gcpDatasetName = n
})
}

// GCPDatasetTTL sets the time-to-live for tables created in GCP BigQuery
// datasets.
func GCPDatasetTTL(ttl time.Duration) Option {
return option(func(c *config) {
c.gcpDatasetTTL = ttl
})
}
9 changes: 9 additions & 0 deletions internal/collector/config_test.go
@@ -17,6 +17,7 @@ package collector
import (
"context"
"testing"
"time"

"go.uber.org/zap/zaptest"
)
@@ -80,6 +81,14 @@ func TestGCPDatasetName(t *testing.T) {
}
}

func TestGCPDatasetTTL(t *testing.T) {
want := time.Duration(24) * time.Hour
c := makeTestConfig(t, GCPDatasetTTL(want))
if c.gcpDatasetTTL != want {
t.Fatalf("config.gcpDatasetTTL = %q, want %q", c.gcpDatasetTTL, want)
}
}

func makeTestConfig(t *testing.T, opts ...Option) *config {
t.Helper()
return makeConfig(context.Background(), zaptest.NewLogger(t), opts...)
30 changes: 26 additions & 4 deletions internal/collector/depsdev/bq.go
@@ -17,6 +17,8 @@ package depsdev
import (
"context"
"errors"
"fmt"
"time"

"cloud.google.com/go/bigquery"
"google.golang.org/api/googleapi"
@@ -39,7 +41,8 @@ type bqAPI interface {
OneResultQuery(ctx context.Context, query string, params map[string]any, result any) error
NoResultQuery(ctx context.Context, query string, params map[string]any) error
GetDataset(ctx context.Context, id string) (*Dataset, error)
- CreateDataset(ctx context.Context, id string) (*Dataset, error)
+ CreateDataset(ctx context.Context, id string, ttl time.Duration) (*Dataset, error)
+ UpdateDataset(ctx context.Context, d *Dataset, ttl time.Duration) error
GetTable(ctx context.Context, d *Dataset, id string) (*Table, error)
}

@@ -96,23 +99,42 @@ func (b *bq) GetDataset(ctx context.Context, id string) (*Dataset, error) {
return nil, nil
}
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("dataset metadata: %w", err)
}
return &Dataset{
ds: ds,
}, nil
}

-func (b *bq) CreateDataset(ctx context.Context, id string) (*Dataset, error) {
+func (b *bq) CreateDataset(ctx context.Context, id string, ttl time.Duration) (*Dataset, error) {
ds := b.client.Dataset(id)
- if err := ds.Create(ctx, &bigquery.DatasetMetadata{}); err != nil {
+ if err := ds.Create(ctx, &bigquery.DatasetMetadata{DefaultTableExpiration: ttl}); err != nil {
return nil, err
}
return &Dataset{
ds: ds,
}, nil
}

func (b *bq) UpdateDataset(ctx context.Context, d *Dataset, ttl time.Duration) error {
md, err := d.ds.Metadata(ctx)
if err != nil {
return fmt.Errorf("dataset metadata: %w", err)
}
var update bigquery.DatasetMetadataToUpdate
needsWrite := false
if md.DefaultTableExpiration != ttl {
update.DefaultTableExpiration = ttl
needsWrite = true
}
if needsWrite {
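// An empty etag makes the update unconditional; there is no
// optimistic-concurrency check against the metadata read above.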
if _, err := d.ds.Update(ctx, update, ""); err != nil {
return fmt.Errorf("dataset update metadata: %w", err)
}
}
return nil
}

func (b *bq) GetTable(ctx context.Context, d *Dataset, id string) (*Table, error) {
t := d.ds.Table(id)
md, err := t.Metadata(ctx)
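The diff doesn't show how `depsdev.NewSource` drives these methods, but given the `bqAPI` interface above, a get-or-create flow along these lines would keep an existing dataset's TTL in sync (a hypothetical sketch, not code from this PR):

```go
// getOrCreateDataset is a hypothetical helper illustrating the bqAPI above.
func getOrCreateDataset(ctx context.Context, b bqAPI, id string, ttl time.Duration) (*Dataset, error) {
	ds, err := b.GetDataset(ctx, id)
	if err != nil {
		return nil, fmt.Errorf("get dataset: %w", err)
	}
	if ds == nil {
		// GetDataset reports a missing dataset as (nil, nil), so create it
		// with the default table expiration in place from the start.
		return b.CreateDataset(ctx, id, ttl)
	}
	// The dataset already exists: reconcile its default TTL so new tables
	// pick up the configured expiration. Existing tables are not touched.
	if err := b.UpdateDataset(ctx, ds, ttl); err != nil {
		return nil, fmt.Errorf("update dataset: %w", err)
	}
	return ds, nil
}
```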