Skip to content

Commit fe48fe9

Browse files
Add Provider authentication for DataSources (#5899)
* Initial draft of datasource auth, from prompt: > Modify the DataSourcesService to accept a `Provider` argument via the datasource Options field, and pass the provider to the data source constructor in BuildFromProtobuf. > > The feature should be controlled by two components: a flag called "authenticated_datasources" and a field on the DataSource protocol buffer message called `provider_auth` in the RestDataSource definition. * Adjust rest datasource to use provider methods when available * Add tests for datasources; improve coverage to 90% * Address comments / lint errors Adds documentation. * Store additional datasource metadata in database
1 parent f6fa9e2 commit fe48fe9

File tree

24 files changed

+1209
-293
lines changed

24 files changed

+1209
-293
lines changed

cmd/dev/app/rule_type/rttst.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ func testCmdRun(cmd *cobra.Command, _ []string) error {
190190
Alert: actionOptFromString(profile.Alert, models.ActionOptOff),
191191
}
192192

193-
dsRegistry, err := getDataSources(dataSourcefiles)
193+
dsRegistry, err := getDataSources(dataSourcefiles, prov)
194194
if err != nil {
195195
return fmt.Errorf("error getting data sources: %w", err)
196196
}
@@ -502,7 +502,7 @@ func actionOptFromString(s *string, defAction models.ActionOpt) models.ActionOpt
502502
return models.ActionOptUnknown
503503
}
504504

505-
func getDataSources(readers []*os.File) (*v1datasources.DataSourceRegistry, error) {
505+
func getDataSources(readers []*os.File, provider provifv1.Provider) (*v1datasources.DataSourceRegistry, error) {
506506
reg := v1datasources.NewDataSourceRegistry()
507507
for _, r := range readers {
508508
fname := r.Name()
@@ -515,7 +515,7 @@ func getDataSources(readers []*os.File) (*v1datasources.DataSourceRegistry, erro
515515
return nil, fmt.Errorf("error validating data source %s: %w", fname, err)
516516
}
517517

518-
intds, err := internalds.BuildFromProtobuf(ds)
518+
intds, err := internalds.BuildFromProtobuf(ds, provider)
519519
if err != nil {
520520
return nil, fmt.Errorf("error building data source %s: %w", fname, err)
521521
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
-- SPDX-FileCopyrightText: Copyright 2024 The Minder Authors
2+
-- SPDX-License-Identifier: Apache-2.0
3+
4+
BEGIN;
5+
6+
ALTER TABLE data_sources DROP COLUMN metadata;
7+
8+
COMMIT;
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
-- SPDX-FileCopyrightText: Copyright 2024 The Minder Authors
2+
-- SPDX-License-Identifier: Apache-2.0
3+
4+
BEGIN;
5+
6+
-- This migration adds support for storing data source metadata at
7+
-- the datasource level. This is stored as JSONB using an internal
8+
-- schema derived from (but separate from) protobuf. (This could
9+
-- also have been used for the data source type or even rule storage, but
10+
-- it's not worth migrating at this time.)
11+
--
12+
-- NULL is equivalent to "empty object" for migration purposes.
13+
14+
ALTER TABLE data_sources
15+
ADD COLUMN metadata JSONB DEFAULT NULL;
16+
17+
COMMIT;

database/query/datasources.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
-- CreateDataSource creates a new datasource in a given project.
22

33
-- name: CreateDataSource :one
4-
INSERT INTO data_sources (project_id, name, display_name, subscription_id)
5-
VALUES ($1, $2, $3, sqlc.narg(subscription_id)) RETURNING *;
4+
INSERT INTO data_sources (project_id, name, display_name, subscription_id, metadata)
5+
VALUES ($1, $2, $3, sqlc.narg(subscription_id), sqlc.arg(metadata)::json) RETURNING *;
66

77
-- AddDataSourceFunction adds a function to a datasource.
88

@@ -14,7 +14,7 @@ VALUES ($1, $2, $3, $4, $5) RETURNING *;
1414

1515
-- name: UpdateDataSource :one
1616
UPDATE data_sources
17-
SET display_name = $3
17+
SET display_name = $3, metadata = sqlc.arg(metadata)::json
1818
WHERE id = $1 AND project_id = $2
1919
RETURNING *;
2020

docs/docs/ref/proto.mdx

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/docs/understand/data_sources.md

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,18 @@ While providers in Minder typically create or manage entities (e.g., repositorie
1212
- They do **not** create entities. Data sources only enhance an entity already known to Minder.
1313
- They can reference external services—for instance, pulling in vulnerability data from OSV or ClearlyDefined or a malware scanning service.
1414
- They have arguments that help shape the queries or requests the data source makes against external systems (e.g., specifying the package name, ecosystem, or version).
15+
- They can leverage the authentication from the current Provider to fetch additional authenticated data after the initial ingestion.
1516

1617
---
1718

1819
### Why Would You Use a *data source*?
1920

2021
You would create a data source in Minder whenever you need additional information about an entity that was not included in the initial ingest. Common scenarios include:
2122

22-
- **Enriching dependencies**: If a provider ingests a list of dependencies from a repository, a data source can query a vulnerability database (like OSV or ClearlyDefined) to see if any are known to be risky *from a security or licensing point of view*.
23-
- **Performing security checks**: A data source might call out to a malware scanner or an external REST service to verify the integrity of binaries or tarballs.
24-
- **Fetching attestation data**: If you need statements of provenance or supply-chain attestations from a separate system, a data source can gather this data for your entity.
23+
- **Follow-up queries**: In some cases, it may be necessary to fetch additional information to evaluate the state of the entity based on data from the initial ingestion. (For example, checking whether a workflow action has been passing after determining the relevant action.)
24+
- **Enriching dependencies**: If a provider ingests a list of dependencies from a repository, a data source can query a vulnerability database (like OSV or ClearlyDefined) to see if any are known to be risky *from a security or licensing point of view*.
25+
- **Performing security checks**: A data source might call out to a malware scanner or an external REST service to verify the integrity of binaries or tarballs.
26+
- **Fetching attestation data**: If you need statements of provenance or supply-chain attestations from a separate system, a data source can gather this data for your entity.
2527
- **Aggregating metadata from multiple sources**: For instance, combining ClearlyDefined’s scoring data with an internal database that tracks maintainers, deprecation status, or license data.
2628

2729
Essentially, data sources let Minder orchestrate external queries that feed into policy evaluations (e.g., Rego constraints) to create richer compliance, security, or operational checks.
@@ -32,14 +34,15 @@ Essentially, data sources let Minder orchestrate external queries that feed into
3234

3335
When you invoke a data source in a Rego policy, you typically provide a set of arguments. These arguments tell the data source *what* to fetch or *how* to fetch it.
3436

35-
For example, consider the YAML snippet below:
37+
For example, consider the two YAML snippets below:
3638

3739
```yaml
3840
version: v1
3941
type: data-source
4042
name: ghapi
4143
context: {}
4244
rest:
45+
providerAuth: true
4346
def:
4447
license:
4548
endpoint: https://api.github.com/repos/{owner}/{repo}/license
@@ -71,20 +74,46 @@ rest:
7174
type: string
7275
repo:
7376
type: string
77+
graphql:
78+
endpoint: https://api.github.com/graphql
79+
method: POST
80+
body_from_field: query
81+
input_schema:
82+
query:
83+
type: object
84+
properties:
85+
query:
86+
type: object
87+
# We don't specify properties here, but a caller might use:
88+
# {concat("", "repository(name:\"", repo "\", owner:\"", owner "\"") {rulesets(first:20) ...}}
89+
fallback:
90+
http_status: 200
91+
body: '{results: [], error: "Error fetching data"}'
7492
```
7593
7694
#### Key Fields
7795
78-
- **version / type / name**: Defines this resource as a data source called `ghapi`.
79-
- **context**: Typically holds the project context. Here it’s `{}`, meaning it’s globally available (or within your chosen project scope).
80-
- **rest**: Declares REST-based operations. Under `def`, we define three endpoints:
81-
- `license` → Fetches repository license info from GitHub
82-
- `repo_config` → Fetches general repo config (e.g., visibility, description, forks, watchers)
83-
- `private_vuln_reporting` → Fetches whether the repository has private vulnerability reporting enabled
84-
- **endpoint**: A template URI with placeholders for `{owner}` and `{repo}`.
85-
- **parse**: Indicates the response format (`json`).
86-
- **input_schema**: Uses JSON Schema to define the parameters needed by this data source in Rego. If you specify `input_schema` incorrectly, you will receive an error at runtime, helping ensure that the data you pass in matches what the data source expects.
96+
- **version / type / name**: Defines this resource as a data source called `ghapi`.
97+
- **context**: Typically holds the project context. Here it’s `{}`, meaning it’s globally available (or within your chosen project scope).
98+
- **rest**: Declares REST-based operations. If `providerAuth` is set to `true`, the provider's authentication mechanism will be used if the method's endpoint matches the provider's URL. Under `def`, we define four endpoints:
99+
- `license` → Fetches repository license info from GitHub
100+
- `repo_config` → Fetches general repo config (e.g., visibility, description, forks, watchers)
101+
- `private_vuln_reporting` → Fetches whether the repository has private vulnerability reporting enabled
102+
- `graphql` → Performs a GraphQL query
103+
104+
Each method defined in the rest endpoints has the following fields:
105+
106+
- **endpoint**: An [RFC 6570](https://tools.ietf.org/html/rfc6570) URI template that is expanded with the supplied arguments (see [Using a data source in a Rule](#using-a-data-source-in-a-rule)).
107+
- **method**: The HTTP method to invoke. Defaults to `GET`.
108+
- **headers**: A key-value map of static headers to add to the request.
109+
- **bodyobj**: Specifies the request body as a static JSON object.
110+
- **bodystr**: Specifies the request body as a static string.
111+
- **body_from_field**: Specifies that the request body should be produced from the specified argument. Objects will be converted to JSON representation, while strings will be used as an exact request body.
112+
- **parse**: Indicates the response format (`json`). If unset, the result will be the body as a string.
113+
- **input_schema**: Uses JSON Schema to define the parameters needed by this data source in Rego. If you specify `input_schema` incorrectly, you will receive an error at runtime, helping ensure that the data you pass in matches what the data source expects.
87114
- *(Note: You can define additional properties as needed, but only fields explicitly handled by the data source code will be recognized.)*
115+
- **expected_status**: Defines the expected response code. The default expected code is 200. If an unexpected response code is received, an error will be raised.
116+
- **fallback**: If the request fails after 4 attempts and a fallback is defined, the specified **http_status** and **body** will be returned.
88117

89118
---
90119

internal/datasources/factory.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ import (
1111
"github.com/mindersec/minder/internal/datasources/structured"
1212
minderv1 "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
1313
v1datasources "github.com/mindersec/minder/pkg/datasources/v1"
14+
provinfv1 "github.com/mindersec/minder/pkg/providers/v1"
1415
)
1516

1617
// BuildFromProtobuf is a factory function that builds a new data source based on the given
1718
// data source type.
18-
func BuildFromProtobuf(ds *minderv1.DataSource) (v1datasources.DataSource, error) {
19+
func BuildFromProtobuf(ds *minderv1.DataSource, provider provinfv1.Provider) (v1datasources.DataSource, error) {
1920
if ds == nil {
2021
return nil, fmt.Errorf("data source is nil")
2122
}
@@ -28,7 +29,7 @@ func BuildFromProtobuf(ds *minderv1.DataSource) (v1datasources.DataSource, error
2829
case *minderv1.DataSource_Structured:
2930
return structured.NewStructDataSource(ds.GetStructured())
3031
case *minderv1.DataSource_Rest:
31-
return rest.NewRestDataSource(ds.GetRest())
32+
return rest.NewRestDataSource(ds.GetRest(), provider)
3233
default:
3334
return nil, fmt.Errorf("unknown data source type: %T", ds)
3435
}

0 commit comments

Comments
 (0)