diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bd85823e5..73408b6f8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -71,12 +71,12 @@ jobs: - name: extract amd64 binary from the image run: | docker create --platform linux/amd64 --name amd64 ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.meta.outputs.version }} && - docker cp amd64:/opt/coroot/coroot /tmp/coroot-amd64 + docker cp amd64:/usr/bin/coroot /tmp/coroot-amd64 - name: extract arm64 binary from the image run: | docker create --platform linux/arm64 --name arm64 ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.meta.outputs.version }} && - docker cp arm64:/opt/coroot/coroot /tmp/coroot-arm64 + docker cp arm64:/usr/bin/coroot /tmp/coroot-arm64 - name: upload amd64 binary uses: actions/upload-release-asset@v1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 36484d77a..eca3320aa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -25,6 +25,20 @@ go run main.go Open http://127.0.0.1:8080 in your browser. +## Running locally (Docker) + +Build: +```shell +docker build -f dev.dockerfile -t coroot-dev . +``` + +Run: +```shell +docker run --rm -p 127.0.0.1:8080:8080 -d coroot-dev +``` + +Open http://127.0.0.1:8080 in your browser. + ## Remote Debug How to use Goland and Dlv for remote debug of Golang code on a remote server. diff --git a/Dockerfile b/Dockerfile index 78beccc65..da296a3aa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,13 +9,24 @@ ARG VERSION=unknown RUN go build -mod=readonly -ldflags "-X main.version=$VERSION" -o coroot . -FROM debian:bullseye -RUN apt update && apt install -y ca-certificates && apt clean +FROM registry.access.redhat.com/ubi9/ubi -WORKDIR /opt/coroot -COPY --from=backend-builder /tmp/src/coroot /opt/coroot/coroot +ARG VERSION=unknown +LABEL name="coroot" \ + vendor="Coroot, Inc." \ + maintainer="Coroot, Inc." \ + version=${VERSION} \ + release="1" \ + summary="Coroot Community Edition." \ + description="Coroot Community Edition container image." 
+ +COPY LICENSE /licenses/LICENSE + +COPY --from=backend-builder /tmp/src/coroot /usr/bin/coroot +RUN mkdir /data && chown 65534:65534 /data +USER 65534:65534 VOLUME /data EXPOSE 8080 -ENTRYPOINT ["/opt/coroot/coroot"] +ENTRYPOINT ["/usr/bin/coroot"] diff --git a/README.md b/README.md index 945c73c2a..0c375d542 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![](https://github.com/coroot/coroot/actions/workflows/ci.yml/badge.svg) [![Go Report Card](https://goreportcard.com/badge/github.com/coroot/coroot)](https://goreportcard.com/report/github.com/coroot/coroot) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![](https://img.shields.io/badge/slack-coroot-brightgreen.svg?logo=slack)](https://join.slack.com/t/coroot-community/shared_invite/zt-1gsnfo0wj-I~Zvtx5CAAb8vr~r~vecyw) +[![](https://img.shields.io/badge/slack-coroot-brightgreen.svg?logo=slack)](https://coroot.com/join-slack-community/) ### [Features](#features) | [Installation](https://docs.coroot.com/) | [Documentation](https://docs.coroot.com/) | [Community & Support](#community--support) | [Live demo](https://demo.coroot.com/) | [Coroot Enterprise](https://coroot.com/enterprise/) @@ -114,7 +114,7 @@ A live demo of Coroot is available at [demo.coroot.com](https://demo.coroot.com/ ## Community & Support -* [Community Slack](https://join.slack.com/t/coroot-community/shared_invite/zt-1gsnfo0wj-I~Zvtx5CAAb8vr~r~vecyw) +* [Community Slack](https://coroot.com/join-slack-community/) * [GitHub Discussions](https://github.com/coroot/coroot/discussions) * [GitHub Issues](https://github.com/coroot/coroot/issues) * Twitter: [@coroot_com](https://twitter.com/coroot_com) diff --git a/api/api.go b/api/api.go index 08dfb3ae7..5b55b17cd 100644 --- a/api/api.go +++ b/api/api.go @@ -2,10 +2,12 @@ package api import ( "context" + "encoding/json" "errors" "fmt" "net/http" "slices" + "sort" "time" "github.com/coroot/coroot/api/forms" @@ -23,6 +25,7 @@ import ( "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" "github.com/gorilla/mux" + "golang.org/x/exp/maps" "k8s.io/klog" ) @@ -169,6 +172,7 @@ func (api *Api) Roles(w http.ResponseWriter, r *http.Request, u *db.User) { rbac.NewPermission(rbac.ScopeProjectInstrumentations, rbac.ActionEdit, nil), rbac.NewPermission(rbac.ScopeApplication, rbac.ActionView, rbac.Object{"application_category": "databases"}), rbac.NewPermission(rbac.ScopeNode, rbac.ActionView, rbac.Object{"node_name": "db*"}), + rbac.NewPermission(rbac.ScopeDashboard, rbac.ActionView, rbac.Object{"dashboard_name": "db*"}), ) roles, err := api.roles.GetRoles() if err != nil { @@ -198,6 +202,13 @@ func (api *Api) SSO(w http.ResponseWriter, r *http.Request, u *db.User) { utils.WriteJson(w, res) } +func (api *Api) AI(w http.ResponseWriter, r *http.Request, u *db.User) { + res := struct { + Provider string `json:"provider"` + }{} + utils.WriteJson(w, res) +} + func (api *Api) Project(w http.ResponseWriter, r *http.Request, u *db.User) { vars := mux.Vars(r) projectId := vars["project"] @@ -226,7 +237,7 @@ func (api *Api) Project(w http.ResponseWriter, r *http.Request, u *db.User) { return } prometheusCfg := project.PrometheusConfig(api.globalPrometheus) - res.Readonly = !project.Settings.Configurable + res.Readonly = project.Settings.Readonly res.Name = project.Name res.RefreshInterval = prometheusCfg.RefreshInterval if isAllowed { @@ -253,7 +264,7 @@ func (api *Api) Project(w http.ResponseWriter, r *http.Request, u *db.User) { Id: 
db.ProjectId(projectId), Name: form.Name, } - project.Settings.Configurable = true + project.Settings.Readonly = false err := api.db.SaveProject(project) if err != nil { if errors.Is(err, db.ErrConflict) { @@ -325,6 +336,11 @@ func (api *Api) Overview(w http.ResponseWriter, r *http.Request, u *db.User) { http.Error(w, "You are not allowed to view traces.", http.StatusForbidden) return } + case "logs": + if !api.IsAllowed(u, rbac.Actions.Project(projectId).Logs().View()) { + http.Error(w, "You are not allowed to view logs.", http.StatusForbidden) + return + } case "costs": if !api.IsAllowed(u, rbac.Actions.Project(projectId).Costs().View()) { http.Error(w, "You are not allowed to view costs.", http.StatusForbidden) @@ -355,6 +371,131 @@ func (api *Api) Overview(w http.ResponseWriter, r *http.Request, u *db.User) { utils.WriteJson(w, api.WithContext(project, cacheStatus, world, views.Overview(r.Context(), ch, world, view, r.URL.Query().Get("query")))) } +func (api *Api) Dashboards(w http.ResponseWriter, r *http.Request, u *db.User) { + world, project, cacheStatus, err := api.LoadWorldByRequest(r) + if err != nil { + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + if project == nil || world == nil { + utils.WriteJson(w, api.WithContext(project, cacheStatus, world, nil)) + return + } + + vars := mux.Vars(r) + id := vars["dashboard"] + + if r.Method == http.MethodPost { + if !api.IsAllowed(u, rbac.Actions.Project(string(project.Id)).Dashboards().Edit()) { + http.Error(w, "You are not allowed to configure dashboards.", http.StatusForbidden) + return + } + var form forms.DashboardForm + if err = forms.ReadAndValidate(r, &form); err != nil { + klog.Warningln("bad request:", err) + http.Error(w, "", http.StatusBadRequest) + return + } + switch form.Action { + case "create": + id, err = api.db.CreateDashboard(project.Id, form.Name, form.Description) + if err == nil { + http.Error(w, id, http.StatusCreated) + return + } + case "update": + err = api.db.UpdateDashboard(project.Id, id, form.Name, form.Description) + case "delete": + err = api.db.DeleteDashboard(project.Id, id) + default: + err = api.db.SaveDashboardConfig(project.Id, id, form.Dashboard.Config) + } + if err != nil { + klog.Errorf("failed to %s dashboard: %s", form.Action, err) + http.Error(w, "", http.StatusInternalServerError) + return + } + return + } + + auditor.Audit(world, project, nil, project.ClickHouseConfig(api.globalClickHouse) != nil, nil) + + if id != "" { + dashboard, err := api.db.GetDashboard(project.Id, id) + if err != nil { + if errors.Is(err, db.ErrNotFound) { + klog.Warningln("dashboard not found:", id) + http.Error(w, "Dashboard not found", http.StatusNotFound) + return + } + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + if !api.IsAllowed(u, rbac.Actions.Project(string(project.Id)).Dashboard(dashboard.Name).View()) { + http.Error(w, "You are not allowed to view this dashboard.", http.StatusForbidden) + return + } + utils.WriteJson(w, api.WithContext(project, cacheStatus, world, views.Dashboards.Dashboard(dashboard))) + return + } + + dashboards, err := api.db.GetDashboards(project.Id) + if err != nil { + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + utils.WriteJson(w, api.WithContext(project, cacheStatus, world, views.Dashboards.List(dashboards))) +} + +func (api *Api) PanelData(w http.ResponseWriter, r *http.Request, u *db.User) { + projectId := db.ProjectId(mux.Vars(r)["project"]) + project, err := 
api.db.GetProject(projectId) + if err != nil { + if errors.Is(err, db.ErrNotFound) { + klog.Warningln("project not found:", projectId) + http.Error(w, "Project not found", http.StatusNotFound) + return + } + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + + query := r.URL.Query().Get("query") + var config db.DashboardPanel + err = json.Unmarshal([]byte(query), &config) + if err != nil { + klog.Warningln("invalid query:", query) + http.Error(w, "Invalid query", http.StatusBadRequest) + return + } + + promConfig := project.PrometheusConfig(api.globalPrometheus) + cfg := prom.NewClientConfig(promConfig.Url, promConfig.RefreshInterval) + cfg.BasicAuth = promConfig.BasicAuth + cfg.TlsSkipVerify = promConfig.TlsSkipVerify + cfg.ExtraSelector = promConfig.ExtraSelector + cfg.CustomHeaders = promConfig.CustomHeaders + promClient, err := prom.NewClient(cfg) + if err != nil { + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + from, to := api.getTimeContext(r) + step := increaseStepForBigDurations(from, to, promConfig.RefreshInterval) + data, err := views.Dashboards.PanelData(r.Context(), promClient, config, from, to, step) + if err != nil { + klog.Errorln(err) + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + utils.WriteJson(w, data) +} + func (api *Api) ApiKeys(w http.ResponseWriter, r *http.Request, u *db.User) { vars := mux.Vars(r) projectId := vars["project"] @@ -373,7 +514,7 @@ func (api *Api) ApiKeys(w http.ResponseWriter, r *http.Request, u *db.User) { Editable bool `json:"editable"` Keys []db.ApiKey `json:"keys"` }{ - Editable: isAllowed && project.Settings.Configurable, + Editable: isAllowed && !project.Settings.Readonly, Keys: project.Settings.ApiKeys, } if !isAllowed { @@ -431,22 +572,46 @@ func (api *Api) Inspections(w http.ResponseWriter, r *http.Request, u *db.User) utils.WriteJson(w, views.Inspections(checkConfigs)) } -func (api *Api) Categories(w http.ResponseWriter, r *http.Request, u *db.User) { +func (api *Api) ApplicationCategories(w http.ResponseWriter, r *http.Request, u *db.User) { vars := mux.Vars(r) projectId := vars["project"] + project, err := api.db.GetProject(db.ProjectId(projectId)) + if err != nil { + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + if r.Method == http.MethodPost { if !api.IsAllowed(u, rbac.Actions.Project(projectId).ApplicationCategories().Edit()) { http.Error(w, "You are not allowed to configure application categories.", http.StatusForbidden) return } var form forms.ApplicationCategoryForm - if err := forms.ReadAndValidate(r, &form); err != nil { + if err = forms.ReadAndValidate(r, &form); err != nil { klog.Warningln("bad request:", err) http.Error(w, "Invalid name or patterns", http.StatusBadRequest) return } - if err := api.db.SaveApplicationCategory(db.ProjectId(projectId), form.Name, form.NewName, form.CustomPatterns, form.NotifyOfDeployments); err != nil { + var category *db.ApplicationCategory + switch form.Action { + case "test": + err = form.SendTestNotification(r.Context(), project) + if err != nil { + klog.Warningln("failed to send test notification:", err) + http.Error(w, err.Error(), http.StatusBadRequest) + } + return + case "delete": + default: + category = &form.ApplicationCategory + } + if err = api.db.SaveApplicationCategory(project, form.Id, category); err != nil { + if errors.Is(err, db.ErrConflict) { + http.Error(w, "Application category already exists.", http.StatusConflict) + return + } 
klog.Errorln("failed to save:", err) http.Error(w, "", http.StatusInternalServerError) return @@ -454,13 +619,31 @@ func (api *Api) Categories(w http.ResponseWriter, r *http.Request, u *db.User) { return } - p, err := api.db.GetProject(db.ProjectId(projectId)) - if err != nil { - klog.Errorln(err) - http.Error(w, "", http.StatusInternalServerError) + categories := project.GetApplicationCategories() + if !r.URL.Query().Has("name") { + cs := maps.Values(categories) + sort.Slice(cs, func(i, j int) bool { + if cs[i].Builtin != cs[j].Builtin { + return cs[i].Builtin + } + return cs[i].Name < cs[j].Name + }) + utils.WriteJson(w, cs) + return + } + name := model.ApplicationCategory(r.URL.Query().Get("name")) + if name == "" { + category := project.NewApplicationCategory() + utils.WriteJson(w, forms.ApplicationCategoryForm{ApplicationCategory: *category}) return } - utils.WriteJson(w, views.Categories(p)) + category := categories[name] + if category == nil { + klog.Warningln("unknown application category:", name) + http.Error(w, "Unknown application category: "+string(name), http.StatusNotFound) + return + } + utils.WriteJson(w, forms.ApplicationCategoryForm{Id: category.Name, ApplicationCategory: *category}) } func (api *Api) CustomApplications(w http.ResponseWriter, r *http.Request, u *db.User) { @@ -494,6 +677,46 @@ func (api *Api) CustomApplications(w http.ResponseWriter, r *http.Request, u *db utils.WriteJson(w, views.CustomApplications(p)) } +func (api *Api) CustomCloudPricing(w http.ResponseWriter, r *http.Request, u *db.User) { + vars := mux.Vars(r) + projectId := vars["project"] + p, err := api.db.GetProject(db.ProjectId(projectId)) + + if err != nil { + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + if r.Method == http.MethodGet { + utils.WriteJson(w, p.Settings.CustomCloudPricing) + return + } + if !api.IsAllowed(u, rbac.Actions.Project(projectId).CustomCloudPricing().Edit()) { + http.Error(w, "You are not allowed to configure custom cloud pricing.", http.StatusForbidden) + return + } + switch r.Method { + case http.MethodDelete: + p.Settings.CustomCloudPricing = nil + case http.MethodPost: + var form forms.CustomCloudPricingForm + if err := forms.ReadAndValidate(r, &form); err != nil { + klog.Warningln("bad request:", err) + http.Error(w, "Invalid form", http.StatusBadRequest) + return + } + p.Settings.CustomCloudPricing = &form.CustomCloudPricing + default: + http.Error(w, "", http.StatusMethodNotAllowed) + return + } + if err := api.db.SaveProjectSettings(p); err != nil { + klog.Errorln("failed to save:", err) + http.Error(w, "", http.StatusInternalServerError) + return + } +} + func (api *Api) Integrations(w http.ResponseWriter, r *http.Request, u *db.User) { vars := mux.Vars(r) projectId := vars["project"] @@ -517,14 +740,23 @@ func (api *Api) Integrations(w http.ResponseWriter, r *http.Request, u *db.User) return } - p, err := api.db.GetProject(db.ProjectId(projectId)) + project, err := api.db.GetProject(db.ProjectId(projectId)) if err != nil { klog.Errorln(err) http.Error(w, "", http.StatusInternalServerError) return } - utils.WriteJson(w, views.Integrations(p)) + integrations := project.Settings.Integrations + utils.WriteJson(w, struct { + BaseUrl string `json:"base_url"` + Integrations []db.IntegrationInfo `json:"integrations"` + Readonly bool `json:"readonly"` + }{ + BaseUrl: integrations.BaseUrl, + Integrations: integrations.GetInfo(), + Readonly: integrations.Readonly, + }) } func (api *Api) Integration(w http.ResponseWriter, r 
*http.Request, u *db.User) { @@ -628,7 +860,7 @@ func (api *Api) Prom(w http.ResponseWriter, r *http.Request, u *db.User) { http.Error(w, "", http.StatusInternalServerError) return } - p := project.Prometheus + p := project.PrometheusConfig(api.globalPrometheus) cfg := prom.NewClientConfig(p.Url, p.RefreshInterval) cfg.BasicAuth = p.BasicAuth cfg.TlsSkipVerify = p.TlsSkipVerify @@ -703,15 +935,15 @@ func (api *Api) Incident(w http.ResponseWriter, r *http.Request, u *db.User) { incidentKey := vars["incident"] incident, err := api.db.GetIncidentByKey(db.ProjectId(projectId), incidentKey) if err != nil { + if errors.Is(err, db.ErrNotFound) { + klog.Warningln("incident not found:", vars["key"]) + http.Error(w, "Incident not found", http.StatusNotFound) + return + } klog.Warningln("failed to get incident:", err) http.Error(w, "failed to get incident", http.StatusInternalServerError) return } - if incident == nil { - klog.Warningln("incident not found:", vars["key"]) - http.Error(w, "Incident not found", http.StatusNotFound) - return - } values := r.URL.Query() values.Add("incident", incidentKey) r.URL.RawQuery = values.Encode() @@ -751,37 +983,75 @@ func (api *Api) Inspection(w http.ResponseWriter, r *http.Request, u *db.User) { } checkId := model.CheckId(vars["type"]) - switch r.Method { - case http.MethodGet: - project, err := api.db.GetProject(db.ProjectId(projectId)) - if err != nil { - klog.Errorln("failed to get project:", err) - http.Error(w, "", http.StatusInternalServerError) + world, project, _, err := api.LoadWorldByRequest(r) + if err != nil { + klog.Errorln(err) + http.Error(w, "", http.StatusInternalServerError) + return + } + if world == nil { + http.Error(w, "Application not found", http.StatusNotFound) + return + } + + var app *model.Application + var category model.ApplicationCategory + if !appId.IsZero() { + app = world.GetApplication(appId) + if app == nil { + klog.Warningln("application not found:", appId) + http.Error(w, "Application not found", http.StatusNotFound) return } - checkConfigs, err := api.db.GetCheckConfigs(db.ProjectId(projectId)) + category = app.Category + } + + switch r.Method { + case http.MethodGet: + checkConfigs, err := api.db.GetCheckConfigs(project.Id) if err != nil { klog.Errorln("failed to get check configs:", err) http.Error(w, "", http.StatusInternalServerError) return } - res := struct { - Form any `json:"form"` - Integrations map[string]string `json:"integrations"` - }{ - Integrations: map[string]string{}, + type Integration struct { + Name string `json:"name"` + Details string `json:"details"` } - for _, i := range project.Settings.Integrations.GetInfo() { - if i.Configured && i.Incidents { - res.Integrations[i.Title] = i.Details + res := struct { + Form any `json:"form"` + Integrations []Integration `json:"integrations"` + }{} + + if app != nil { + if categorySettings := project.GetApplicationCategories()[app.Category]; categorySettings != nil { + notificationSettings := categorySettings.NotificationSettings.Incidents + if notificationSettings.Enabled { + if slack := notificationSettings.Slack; slack != nil && slack.Enabled { + res.Integrations = append(res.Integrations, Integration{Name: "Slack", Details: fmt.Sprintf("channel: #%s", slack.Channel)}) + } + if teams := notificationSettings.Teams; teams != nil && teams.Enabled { + res.Integrations = append(res.Integrations, Integration{Name: "MS Teams"}) + } + if pagerduty := notificationSettings.Pagerduty; pagerduty != nil && pagerduty.Enabled { + res.Integrations = append(res.Integrations, 
Integration{Name: "Pagerduty"}) + } + if opsgenie := notificationSettings.Opsgenie; opsgenie != nil && opsgenie.Enabled { + res.Integrations = append(res.Integrations, Integration{Name: "Opsgenie"}) + } + if webhook := notificationSettings.Webhook; webhook != nil && webhook.Enabled { + res.Integrations = append(res.Integrations, Integration{Name: "Webhook"}) + } + } } } + switch checkId { case model.Checks.SLOAvailability.Id: cfg, def := checkConfigs.GetAvailability(appId) res.Form = forms.CheckConfigSLOAvailabilityForm{Configs: []model.CheckConfigSLOAvailability{cfg}, Default: def} case model.Checks.SLOLatency.Id: - cfg, def := checkConfigs.GetLatency(appId, model.CalcApplicationCategory(appId, project.Settings.ApplicationCategories)) + cfg, def := checkConfigs.GetLatency(appId, category) res.Form = forms.CheckConfigSLOLatencyForm{Configs: []model.CheckConfigSLOLatency{cfg}, Default: def} default: form := forms.CheckConfigForm{ @@ -809,19 +1079,19 @@ func (api *Api) Inspection(w http.ResponseWriter, r *http.Request, u *db.User) { http.Error(w, "", http.StatusBadRequest) return } - if err := api.db.SaveCheckConfig(db.ProjectId(projectId), appId, checkId, form.Configs); err != nil { + if err = api.db.SaveCheckConfig(db.ProjectId(projectId), appId, checkId, form.Configs); err != nil { klog.Errorln("failed to save check config:", err) http.Error(w, "", http.StatusInternalServerError) return } case model.Checks.SLOLatency.Id: var form forms.CheckConfigSLOLatencyForm - if err := forms.ReadAndValidate(r, &form); err != nil { + if err = forms.ReadAndValidate(r, &form); err != nil { klog.Warningln("bad request:", err) http.Error(w, "", http.StatusBadRequest) return } - if err := api.db.SaveCheckConfig(db.ProjectId(projectId), appId, checkId, form.Configs); err != nil { + if err = api.db.SaveCheckConfig(db.ProjectId(projectId), appId, checkId, form.Configs); err != nil { klog.Errorln("failed to save check config:", err) http.Error(w, "", http.StatusInternalServerError) return @@ -843,7 +1113,7 @@ func (api *Api) Inspection(w http.ResponseWriter, r *http.Request, u *db.User) { case 2: id = appId } - if err := api.db.SaveCheckConfig(db.ProjectId(projectId), id, checkId, cfg); err != nil { + if err = api.db.SaveCheckConfig(db.ProjectId(projectId), id, checkId, cfg); err != nil { klog.Errorln("failed to save check config:", err) http.Error(w, "", http.StatusInternalServerError) return @@ -1221,12 +1491,10 @@ func (api *Api) LoadWorld(ctx context.Context, project *db.Project, from, to tim return nil, cacheStatus, err } - duration := to.Sub(from) if cacheTo.Before(to) { to = cacheTo - duration = to.Sub(from) } - step = increaseStepForBigDurations(duration, step) + step = increaseStepForBigDurations(from, to, step) ctr := constructor.New(api.db, project, cacheClient, api.pricing) world, err := ctr.LoadWorld(ctx, from, to, step, nil) @@ -1244,13 +1512,26 @@ func (api *Api) LoadWorldByRequest(r *http.Request) (*model.World, *db.Project, return nil, nil, nil, err } + from, to := api.getTimeContext(r) + world, cacheStatus, err := api.LoadWorld(r.Context(), project, from, to) + if world == nil { + step := increaseStepForBigDurations(from, to, 15*timeseries.Second) + world = model.NewWorld(from, to.Add(-step), step, step) + } + return world, project, cacheStatus, err +} + +func (api *Api) getTimeContext(r *http.Request) (from timeseries.Time, to timeseries.Time) { now := timeseries.Now() q := r.URL.Query() - from := utils.ParseTime(now, q.Get("from"), now.Add(-timeseries.Hour)) - to := utils.ParseTime(now, 
q.Get("to"), now) - + from = utils.ParseTime(now, q.Get("from"), now.Add(-timeseries.Hour)) + to = utils.ParseTime(now, q.Get("to"), now) + if from >= to { + from = to.Add(-timeseries.Hour) + } incidentKey := q.Get("incident") if incidentKey != "" { + projectId := db.ProjectId(mux.Vars(r)["project"]) if incident, err := api.db.GetIncidentByKey(projectId, incidentKey); err != nil { klog.Warningln("failed to get incident:", err) } else { @@ -1263,16 +1544,11 @@ func (api *Api) LoadWorldByRequest(r *http.Request) (*model.World, *db.Project, } } } - - world, cacheStatus, err := api.LoadWorld(r.Context(), project, from, to) - if world == nil { - step := increaseStepForBigDurations(to.Sub(from), 15*timeseries.Second) - world = model.NewWorld(from, to.Add(-step), step, step) - } - return world, project, cacheStatus, err + return } -func increaseStepForBigDurations(duration, step timeseries.Duration) timeseries.Duration { +func increaseStepForBigDurations(from, to timeseries.Time, step timeseries.Duration) timeseries.Duration { + duration := to.Sub(from) switch { case duration > 5*timeseries.Day: return maxDuration(step, 60*timeseries.Minute) diff --git a/api/forms/forms.go b/api/forms/forms.go index 3daf0e22a..e07e8ffe7 100644 --- a/api/forms/forms.go +++ b/api/forms/forms.go @@ -1,6 +1,7 @@ package forms import ( + "cmp" "context" "errors" "fmt" @@ -15,6 +16,7 @@ import ( "github.com/coroot/coroot/model" "github.com/coroot/coroot/notifications" "github.com/coroot/coroot/prom" + "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" ) @@ -59,6 +61,15 @@ func (f *ApiKeyForm) Valid() bool { return true } +type DashboardForm struct { + Action string `json:"action"` + db.Dashboard +} + +func (f *DashboardForm) Valid() bool { + return f.Name != "" +} + type CheckConfigForm struct { Configs []*model.CheckConfigSimple `json:"configs"` } @@ -96,24 +107,27 @@ func (f *CheckConfigSLOLatencyForm) Valid() bool { } type ApplicationCategoryForm struct { - Name model.ApplicationCategory `json:"name"` - NewName model.ApplicationCategory `json:"new_name"` - - CustomPatternsStr string `json:"custom_patterns"` - CustomPatterns []string - - NotifyOfDeployments bool `json:"notify_of_deployments"` + Action string `json:"action"` + Id model.ApplicationCategory `json:"id"` + db.ApplicationCategory + Test *struct { + Incident *db.ApplicationCategoryNotificationDestinations `json:"incident,omitempty"` + Deployment *db.ApplicationCategoryNotificationDestinations `json:"deployment,omitempty"` + } `json:"test,omitempty"` } func (f *ApplicationCategoryForm) Valid() bool { - if !slugRe.MatchString(string(f.NewName)) { + if f.Test != nil { + return true + } + if !slugRe.MatchString(string(f.Name)) { return false } - f.CustomPatterns = strings.Fields(f.CustomPatternsStr) - if !utils.GlobValidate(f.CustomPatterns) { + customPatterns := strings.Fields(f.CustomPatterns) + if !utils.GlobValidate(customPatterns) { return false } - for _, p := range f.CustomPatterns { + for _, p := range customPatterns { if strings.Count(p, "/") != 1 || strings.Index(p, "/") < 1 { return false } @@ -121,6 +135,49 @@ func (f *ApplicationCategoryForm) Valid() bool { return true } +func (f *ApplicationCategoryForm) SendTestNotification(ctx context.Context, project *db.Project) error { + if f.Test == nil { + return nil + } + integrations := project.Settings.Integrations + var client notifications.NotificationClient + switch { + case f.Test.Incident != nil: + if slack := f.Test.Incident.Slack; slack != nil && integrations.Slack != nil { + 
client = notifications.NewSlack(integrations.Slack.Token, cmp.Or(slack.Channel, integrations.Slack.DefaultChannel)) + } + if teams := f.Test.Incident.Teams; teams != nil && integrations.Teams != nil { + client = notifications.NewTeams(integrations.Teams.WebhookUrl) + } + if pagerduty := f.Test.Incident.Pagerduty; pagerduty != nil && integrations.Pagerduty != nil { + client = notifications.NewPagerduty(integrations.Pagerduty.IntegrationKey) + } + if opsgenie := f.Test.Incident.Opsgenie; opsgenie != nil && integrations.Opsgenie != nil { + client = notifications.NewOpsgenie(integrations.Opsgenie.ApiKey, integrations.Opsgenie.EUInstance) + } + if webhook := f.Test.Incident.Webhook; webhook != nil && integrations.Webhook != nil { + client = notifications.NewWebhook(integrations.Webhook) + } + if client != nil { + return client.SendIncident(ctx, integrations.BaseUrl, testIncidentNotification(project)) + } + case f.Test.Deployment != nil: + if slack := f.Test.Deployment.Slack; slack != nil && integrations.Slack != nil { + client = notifications.NewSlack(integrations.Slack.Token, cmp.Or(slack.Channel, integrations.Slack.DefaultChannel)) + } + if teams := f.Test.Deployment.Teams; teams != nil && integrations.Teams != nil { + client = notifications.NewTeams(integrations.Teams.WebhookUrl) + } + if webhook := f.Test.Deployment.Webhook; webhook != nil && integrations.Webhook != nil { + client = notifications.NewWebhook(integrations.Webhook) + } + if client != nil { + return client.SendDeployment(ctx, project, testDeploymentNotification()) + } + } + return nil +} + type CustomApplicationForm struct { Name string `json:"name"` NewName string `json:"new_name"` @@ -140,6 +197,17 @@ func (f *CustomApplicationForm) Valid() bool { return true } +type CustomCloudPricingForm struct { + db.CustomCloudPricing +} + +func (f *CustomCloudPricingForm) Valid() bool { + if f.PerCPUCore <= 0 || f.PerMemoryGb <= 0 { + return false + } + return true +} + type ApplicationInstrumentationForm struct { model.ApplicationInstrumentation } @@ -405,7 +473,7 @@ type IntegrationFormSlack struct { } func (f *IntegrationFormSlack) Valid() bool { - if f.Token == "" || f.DefaultChannel == "" { + if err := f.Validate(); err != nil { return false } return true @@ -442,7 +510,7 @@ type IntegrationFormTeams struct { } func (f *IntegrationFormTeams) Valid() bool { - if f.WebhookUrl == "" { + if err := f.Validate(); err != nil { return false } return true @@ -479,7 +547,7 @@ type IntegrationFormPagerduty struct { } func (f *IntegrationFormPagerduty) Valid() bool { - if f.IntegrationKey == "" { + if err := f.Validate(); err != nil { return false } return true @@ -515,7 +583,7 @@ type IntegrationFormOpsgenie struct { } func (f *IntegrationFormOpsgenie) Valid() bool { - if f.ApiKey == "" { + if err := f.Validate(); err != nil { return false } return true @@ -551,13 +619,7 @@ type IntegrationFormWebhook struct { } func (f *IntegrationFormWebhook) Valid() bool { - if f.Url == "" { - return false - } - if f.Incidents && f.IncidentTemplate == "" { - return false - } - if f.Deployments && f.DeploymentTemplate == "" { + if err := f.Validate(); err != nil { return false } return true @@ -606,7 +668,7 @@ func (f *IntegrationFormWebhook) Test(ctx context.Context, project *db.Project) func testIncidentNotification(project *db.Project) *db.IncidentNotification { return &db.IncidentNotification{ ProjectId: project.Id, - ApplicationId: model.NewApplicationId("default", model.ApplicationKindDeployment, "test-alert-fake-app"), + ApplicationId: 
model.NewApplicationId("default", model.ApplicationKindDeployment, "fake-app"), IncidentKey: "123ab456", Status: model.WARNING, Details: &db.IncidentNotificationDetails{ @@ -628,9 +690,11 @@ func testDeploymentNotification() model.ApplicationDeploymentStatus { {Report: model.AuditReportCPU, Ok: true, Message: "Memory: looks like the memory leak has been fixed"}, }, Deployment: &model.ApplicationDeployment{ - ApplicationId: model.NewApplicationId("default", model.ApplicationKindDeployment, "test-deployment-fake-app"), + ApplicationId: model.NewApplicationId("default", model.ApplicationKindDeployment, "fake-app"), Name: "123ab456", + StartedAt: timeseries.Now().Add(-model.ApplicationDeploymentMinLifetime), Details: &model.ApplicationDeploymentDetails{ContainerImages: []string{"app:v1.8.2"}}, + Notifications: &model.ApplicationDeploymentNotifications{}, }, } } diff --git a/api/views/application/application.go b/api/views/application/application.go index 2b4ff1374..727e407d8 100644 --- a/api/views/application/application.go +++ b/api/views/application/application.go @@ -30,6 +30,7 @@ type Application struct { Category model.ApplicationCategory `json:"category"` Custom bool `json:"custom"` Status model.Status `json:"status"` + Icon string `json:"icon"` Indicators []model.Indicator `json:"indicators"` Labels model.Labels `json:"labels"` @@ -52,6 +53,7 @@ func Render(world *model.World, app *model.Application) *View { Category: app.Category, Custom: app.Custom, Status: app.Status, + Icon: app.ApplicationType().Icon(), Indicators: model.CalcIndicators(app), Labels: app.Labels(), }, @@ -59,7 +61,6 @@ func Render(world *model.World, app *model.Application) *View { Categories: world.Categories, } - deps := map[model.ApplicationId]bool{} for _, instance := range app.Instances { if instance.IsObsolete() || instance.IsFailed() { continue @@ -86,13 +87,12 @@ func Render(world *model.World, app *model.Application) *View { for _, connection := range app.Upstreams { if connection.RemoteApplication.Id != app.Id { - deps[connection.RemoteApplication.Id] = true appMap.addDependency(connection) } } for _, connection := range app.Downstreams { if connection.Application.Id != app.Id { - appMap.addClient(connection, deps[connection.Application.Id]) + appMap.addClient(connection) } } sort.Slice(appMap.Instances, func(i1, i2 int) bool { @@ -126,6 +126,7 @@ func (m *AppMap) addDependency(c *model.AppToAppConnection) { Id: c.RemoteApplication.Id, Custom: c.RemoteApplication.Custom, Status: c.RemoteApplication.Status, + Icon: c.RemoteApplication.ApplicationType().Icon(), Indicators: model.CalcIndicators(c.RemoteApplication), Labels: c.RemoteApplication.Labels(), @@ -145,13 +146,11 @@ func (m *AppMap) addDependency(c *model.AppToAppConnection) { a.LinkStats = utils.FormatLinkStats(requests, latency, bytesSent, bytesReceived, reason) } -func (m *AppMap) addClient(c *model.AppToAppConnection, seenInDeps bool) { - if seenInDeps { - for _, d := range m.Dependencies { - if d.Id != c.Application.Id { - d.LinkDirection = "both" - return - } +func (m *AppMap) addClient(c *model.AppToAppConnection) { + for _, d := range m.Dependencies { + if d.Id == c.Application.Id { + d.LinkDirection = "both" + return } } status, reason := c.Status() @@ -159,6 +158,7 @@ func (m *AppMap) addClient(c *model.AppToAppConnection, seenInDeps bool) { Id: c.Application.Id, Custom: c.Application.Custom, Status: c.Application.Status, + Icon: c.Application.ApplicationType().Icon(), Indicators: model.CalcIndicators(c.Application), Labels: 
c.Application.Labels(), diff --git a/api/views/applications/categories.go b/api/views/applications/categories.go deleted file mode 100644 index 42317d198..000000000 --- a/api/views/applications/categories.go +++ /dev/null @@ -1,66 +0,0 @@ -package applications - -import ( - "sort" - "strings" - - "github.com/coroot/coroot/db" - "github.com/coroot/coroot/model" -) - -type CategoriesView struct { - Categories []Category `json:"categories"` - Integrations map[string]string `json:"integrations"` -} - -type Category struct { - Name model.ApplicationCategory `json:"name"` - Builtin bool `json:"builtin"` - Default bool `json:"default"` - BuiltinPatterns string `json:"builtin_patterns"` - CustomPatterns string `json:"custom_patterns"` - NotifyOfDeployments bool `json:"notify_of_deployments"` -} - -func RenderCategories(p *db.Project) *CategoriesView { - var categories []Category - for c, ps := range model.BuiltinCategoryPatterns { - categories = append(categories, Category{ - Name: c, - Builtin: c.Builtin(), - Default: c.Default(), - BuiltinPatterns: strings.Join(ps, " "), - CustomPatterns: strings.Join(p.Settings.ApplicationCategories[c], " "), - NotifyOfDeployments: p.Settings.ApplicationCategorySettings[c].NotifyOfDeployments, - }) - } - sort.Slice(categories, func(i, j int) bool { - return categories[i].Name < categories[j].Name - }) - - var custom []Category - for c, ps := range p.Settings.ApplicationCategories { - if _, ok := model.BuiltinCategoryPatterns[c]; ok { - continue - } - custom = append(custom, Category{ - Name: c, - CustomPatterns: strings.Join(ps, " "), - NotifyOfDeployments: p.Settings.ApplicationCategorySettings[c].NotifyOfDeployments, - }) - } - sort.Slice(custom, func(i, j int) bool { - return custom[i].Name < custom[j].Name - }) - - categories = append(categories, custom...) 
- - v := &CategoriesView{Categories: categories, Integrations: map[string]string{}} - - for _, i := range p.Settings.Integrations.GetInfo() { - if i.Configured && i.Deployments { - v.Integrations[i.Title] = i.Details - } - } - return v -} diff --git a/api/views/applications/custom_applications.go b/api/views/applications/custom_applications.go index 63cc734ed..3294ab40a 100644 --- a/api/views/applications/custom_applications.go +++ b/api/views/applications/custom_applications.go @@ -21,7 +21,7 @@ func RenderCustomApplications(p *db.Project) *CustomApplicationsView { for name, app := range p.Settings.CustomApplications { v.CustomApplications = append(v.CustomApplications, CustomApplication{ Name: name, - InstancePatterns: strings.Join(app.InstancePattens, " "), + InstancePatterns: strings.Join(app.InstancePatterns, " "), }) } sort.Slice(v.CustomApplications, func(i, j int) bool { diff --git a/api/views/dashboards/dashboards.go b/api/views/dashboards/dashboards.go new file mode 100644 index 000000000..de6fd1e16 --- /dev/null +++ b/api/views/dashboards/dashboards.go @@ -0,0 +1,36 @@ +package dashboards + +import "github.com/coroot/coroot/db" + +type Dashboards struct { +} + +type Dashboard struct { + Id string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + Config *db.DashboardConfig `json:"config,omitempty"` +} + +func (ds *Dashboards) List(dashboards []*db.Dashboard) []Dashboard { + res := make([]Dashboard, 0, len(dashboards)) + for _, d := range dashboards { + dd := Dashboard{ + Id: d.Id, + Name: d.Name, + Description: d.Description, + } + res = append(res, dd) + } + return res +} + +func (ds *Dashboards) Dashboard(dashboard *db.Dashboard) Dashboard { + dd := Dashboard{ + Id: dashboard.Id, + Name: dashboard.Name, + Description: dashboard.Description, + Config: &dashboard.Config, + } + return dd +} diff --git a/api/views/dashboards/panel.go b/api/views/dashboards/panel.go new file mode 100644 index 000000000..a2e5583fd --- /dev/null +++ b/api/views/dashboards/panel.go @@ -0,0 +1,61 @@ +package dashboards + +import ( + "context" + "fmt" + "regexp" + + "github.com/coroot/coroot/db" + "github.com/coroot/coroot/model" + "github.com/coroot/coroot/prom" + "github.com/coroot/coroot/timeseries" +) + +type PanelData struct { + Chart *model.Chart `json:"chart,omitempty"` +} + +func (ds *Dashboards) PanelData(ctx context.Context, pc *prom.Client, config db.DashboardPanel, from, to timeseries.Time, step timeseries.Duration) (*PanelData, error) { + var res PanelData + switch { + case config.Source.Metrics != nil: + for _, q := range config.Source.Metrics.Queries { + if q.Query == "" { + continue + } + mvs, err := pc.QueryRange(ctx, q.Query, prom.FilterLabelsKeepAll, from, to, step) + if err != nil { + return nil, err + } + for _, mv := range mvs { + name := q.Legend + if name != "" { + for k, v := range mv.Labels { + if r, _ := regexp.Compile(fmt.Sprintf(`{{\s*%s\s*}}`, k)); r != nil { + name = r.ReplaceAllString(name, v) + } + } + } + if name == "" { + name = mv.Labels.String() + } + if name == "" { + name = q.Query + } + if chart := config.Widget.Chart; chart != nil { + if res.Chart == nil { + res.Chart = model.NewChart(timeseries.NewContext(from, to, step), "") + } + if chart.Stacked { + res.Chart = res.Chart.Stacked() + } + if chart.Display == "bar" { + res.Chart = res.Chart.Column() + } + res.Chart.AddSeries(name, mv.Values) + } + } + } + } + return &res, nil +} diff --git a/api/views/inspections/inspections.go b/api/views/inspections/inspections.go index 
b84d28f10..786c84243 100644 --- a/api/views/inspections/inspections.go +++ b/api/views/inspections/inspections.go @@ -16,6 +16,7 @@ type Check struct { model.Check GlobalThreshold float32 `json:"global_threshold"` ProjectThreshold *float32 `json:"project_threshold"` + ProjectDetails string `json:"project_details"` ApplicationOverrides []Application `json:"application_overrides"` } @@ -71,18 +72,30 @@ func (v *View) addReport(name model.AuditReportName, checks ...model.CheckConfig } case []model.CheckConfigSLOAvailability: for _, c := range cfg { - ch.ApplicationOverrides = append(ch.ApplicationOverrides, Application{ - Id: appId, - Threshold: c.ObjectivePercentage, - }) + if appId.IsZero() { + t := c.ObjectivePercentage + ch.ProjectThreshold = &t + } else { + ch.ApplicationOverrides = append(ch.ApplicationOverrides, Application{ + Id: appId, + Threshold: c.ObjectivePercentage, + }) + } } case []model.CheckConfigSLOLatency: for _, c := range cfg { - ch.ApplicationOverrides = append(ch.ApplicationOverrides, Application{ - Id: appId, - Threshold: c.ObjectivePercentage, - Details: "< " + utils.FormatLatency(c.ObjectiveBucket), - }) + details := "< " + utils.FormatLatency(c.ObjectiveBucket) + if appId.IsZero() { + t := c.ObjectivePercentage + ch.ProjectThreshold = &t + ch.ProjectDetails = details + } else { + ch.ApplicationOverrides = append(ch.ApplicationOverrides, Application{ + Id: appId, + Threshold: c.ObjectivePercentage, + Details: details, + }) + } } default: klog.Warningln("unknown config type") diff --git a/api/views/integrations/integrations.go b/api/views/integrations/integrations.go deleted file mode 100644 index 87dbce635..000000000 --- a/api/views/integrations/integrations.go +++ /dev/null @@ -1,39 +0,0 @@ -package integrations - -import ( - "github.com/coroot/coroot/db" -) - -type View struct { - BaseUrl string `json:"base_url"` - Integrations []Integration `json:"integrations"` -} - -type Integration struct { - Type db.IntegrationType `json:"type"` - Title string `json:"title"` - Configured bool `json:"configured"` - Incidents bool `json:"incidents"` - Deployments bool `json:"deployments"` - Details string `json:"details"` -} - -func Render(p *db.Project) *View { - integrations := p.Settings.Integrations - v := &View{ - BaseUrl: integrations.BaseUrl, - } - - for _, i := range integrations.GetInfo() { - v.Integrations = append(v.Integrations, Integration{ - Type: i.Type, - Title: i.Title, - Configured: i.Configured, - Incidents: i.Incidents, - Deployments: i.Deployments, - Details: i.Details, - }) - } - - return v -} diff --git a/api/views/logs/logs.go b/api/views/logs/logs.go index 53a395212..14d570877 100644 --- a/api/views/logs/logs.go +++ b/api/views/logs/logs.go @@ -1,10 +1,12 @@ package logs import ( + "cmp" "context" "encoding/json" "fmt" "net/url" + "slices" "sort" "strings" @@ -23,24 +25,23 @@ const ( ) type View struct { - Status model.Status `json:"status"` - Message string `json:"message"` - Sources []model.LogSource `json:"sources"` - Source model.LogSource `json:"source"` - Services []string `json:"services"` - Service string `json:"service"` - Views []string `json:"views"` - View string `json:"view"` - Severities []string `json:"severities"` - Severity []string `json:"severity"` - Chart *model.Chart `json:"chart"` - Entries []Entry `json:"entries"` - Patterns []*Pattern `json:"patterns"` - Limit int `json:"limit"` + Status model.Status `json:"status"` + Message string `json:"message"` + Sources []model.LogSource `json:"sources"` + Source model.LogSource 
`json:"source"` + Services []string `json:"services"` + Service string `json:"service"` + View string `json:"view"` + Chart *model.Chart `json:"chart"` + Entries []Entry `json:"entries"` + Patterns []*Pattern `json:"patterns"` + Limit int `json:"limit"` + Suggest []string `json:"suggest"` } type Pattern struct { Severity string `json:"severity"` + Color string `json:"color"` Sample string `json:"sample"` Sum uint64 `json:"sum"` Chart *model.Chart `json:"chart"` @@ -50,21 +51,24 @@ type Pattern struct { type Entry struct { Timestamp int64 `json:"timestamp"` Severity string `json:"severity"` + Color string `json:"color"` Message string `json:"message"` Attributes map[string]string `json:"attributes"` + TraceId string `json:"trace_id"` } type Query struct { - Source model.LogSource `json:"source"` - View string `json:"view"` - Severity []string `json:"severity"` - Search string `json:"search"` - Hash string `json:"hash"` - Limit int `json:"limit"` + Source model.LogSource `json:"source"` + View string `json:"view"` + Filters []clickhouse.LogFilter `json:"filters"` + Limit int `json:"limit"` + Suggest *string `json:"suggest,omitempty"` } func Render(ctx context.Context, ch *clickhouse.Client, app *model.Application, query url.Values, w *model.World) *View { - v := &View{} + v := &View{ + Status: model.OK, + } var q Query if s := query.Get("query"); s != "" { @@ -92,40 +96,19 @@ func Render(ctx context.Context, ch *clickhouse.Client, app *model.Application, return v } - v.View = q.View - if v.View == "" { - v.View = viewMessages - } - renderEntries(ctx, v, ch, app, w, q) - - if v.Status == model.UNKNOWN { - v.View = viewPatterns - renderPatterns(v, app, w.Ctx) - return v - } - - v.Views = append(v.Views, viewMessages) - if v.Source == model.LogSourceAgent { - v.Views = append(v.Views, viewPatterns) - if v.View == viewPatterns { - renderPatterns(v, app, w.Ctx) - } - } - return v -} + v.View = cmp.Or(q.View, viewMessages) -func renderEntries(ctx context.Context, v *View, ch *clickhouse.Client, app *model.Application, w *model.World, q Query) { services, err := ch.GetServicesFromLogs(ctx, w.Ctx.From) if err != nil { klog.Errorln(err) v.Status = model.WARNING v.Message = fmt.Sprintf("Clickhouse error: %s", err) - return + return v } var logsFromAgentFound bool var otelServices []string - for s := range services { + for _, s := range services { if strings.HasPrefix(s, "/") { logsFromAgentFound = true } else { @@ -150,12 +133,14 @@ func renderEntries(ctx context.Context, v *View, ch *clickhouse.Client, app *mod } v.Services = append(v.Services, s) } - sort.Strings(v.Services) + slices.Sort(v.Services) if len(v.Sources) == 0 { v.Status = model.UNKNOWN v.Message = "No logs found in ClickHouse" - return + v.View = viewPatterns + renderPatterns(v, app, w.Ctx) + return v } v.Source = q.Source @@ -166,49 +151,60 @@ func renderEntries(ctx context.Context, v *View, ch *clickhouse.Client, app *mod v.Source = model.LogSourceAgent } } - v.Severity = q.Severity - - var histogram map[string]*timeseries.TimeSeries - var entries []*model.LogEntry switch v.Source { case model.LogSourceOtel: v.Message = fmt.Sprintf("Using OpenTelemetry logs of %s", otelService) - v.Severities = services[v.Service] - if len(v.Severity) == 0 { - v.Severity = v.Severities - } - if v.View == viewMessages { - histogram, err = ch.GetServiceLogsHistogram(ctx, w.Ctx.From, w.Ctx.To, w.Ctx.Step, otelService, v.Severity, q.Search) - if err == nil { - entries, err = ch.GetServiceLogs(ctx, w.Ctx.From, w.Ctx.To, otelService, v.Severity, q.Search, 
q.Limit) - } - } case model.LogSourceAgent: v.Message = "Using container logs" - containers := map[string][]string{} - severities := utils.NewStringSet() - for _, i := range app.Instances { - for _, c := range i.Containers { - s := model.ContainerIdToServiceName(c.Id) - containers[s] = append(containers[s], c.Id) - severities.Add(services[s]...) + } + + switch v.View { + case viewPatterns: + renderPatterns(v, app, w.Ctx) + case viewMessages: + renderEntries(ctx, v, ch, app, w, q, otelService) + } + + return v +} + +func renderEntries(ctx context.Context, v *View, ch *clickhouse.Client, app *model.Application, w *model.World, q Query, otelService string) { + if len(app.Instances) == 0 { + return + } + var err error + lq := clickhouse.LogQuery{ + Ctx: w.Ctx, + Filters: q.Filters, + Limit: q.Limit, + } + switch v.Source { + case model.LogSourceOtel: + lq.Services = []string{otelService} + case model.LogSourceAgent: + lq.Services = getServices(app) + hashes := utils.NewStringSet() + for _, f := range q.Filters { + if f.Name == "pattern.hash" { + hashes.Add(getSimilarHashes(app, f.Value)...) } } - v.Severities = severities.Items() - if len(v.Severity) == 0 { - v.Severity = v.Severities + for _, hash := range hashes.Items() { + lq.Filters = append(lq.Filters, clickhouse.LogFilter{Name: "pattern.hash", Op: "=", Value: hash}) } - if v.View == viewMessages { - var hashes []string - if q.Hash != "" { - hashes = getSimilarHashes(app, q.Hash) - } - histogram, err = ch.GetContainerLogsHistogram(ctx, w.Ctx.From, w.Ctx.To, w.Ctx.Step, containers, v.Severity, hashes, q.Search) - if err == nil { - entries, err = ch.GetContainerLogs(ctx, w.Ctx.From, w.Ctx.To, containers, v.Severity, hashes, q.Search, q.Limit) - } + } + + var histogram []model.LogHistogramBucket + var entries []*model.LogEntry + if q.Suggest != nil { + v.Suggest, err = ch.GetLogFilters(ctx, lq, *q.Suggest) + } else { + histogram, err = ch.GetLogsHistogram(ctx, lq) + if err == nil { + entries, err = ch.GetLogs(ctx, lq) } } + if err != nil { klog.Errorln(err) v.Status = model.WARNING @@ -216,21 +212,21 @@ func renderEntries(ctx context.Context, v *View, ch *clickhouse.Client, app *mod return } - v.Status = model.OK - if len(histogram) > 0 { - v.Chart = model.NewChart(w.Ctx, "").Column() - for severity, ts := range histogram { - v.Chart.AddSeries(severity, ts) + v.Chart = model.NewChart(w.Ctx, "").Column().Sorted() + for _, b := range histogram { + v.Chart.AddSeries(b.Severity.String(), b.Timeseries, b.Severity.Color()) } } for _, e := range entries { entry := Entry{ Timestamp: e.Timestamp.UnixMilli(), - Severity: e.Severity, + Severity: e.Severity.String(), + Color: e.Severity.Color(), Message: e.Body, Attributes: map[string]string{}, + TraceId: e.TraceId, } for name, value := range e.LogAttributes { if name != "" && value != "" { @@ -242,9 +238,6 @@ func renderEntries(ctx context.Context, v *View, ch *clickhouse.Client, app *mod entry.Attributes[name] = value } } - if e.TraceId != "" { - entry.Attributes["trace.id"] = e.TraceId - } v.Entries = append(v.Entries, entry) } if len(v.Entries) >= q.Limit { @@ -253,23 +246,23 @@ func renderEntries(ctx context.Context, v *View, ch *clickhouse.Client, app *mod } func renderPatterns(v *View, app *model.Application, ctx timeseries.Context) { - bySeverity := map[string]*timeseries.Aggregate{} - for level, msgs := range app.LogMessages { + bySeverity := map[model.Severity]*timeseries.Aggregate{} + for severity, msgs := range app.LogMessages { for hash, pattern := range msgs.Patterns { sum := 
pattern.Messages.Reduce(timeseries.NanSum) if timeseries.IsNaN(sum) || sum == 0 { continue } - severity := string(level) if bySeverity[severity] == nil { bySeverity[severity] = timeseries.NewAggregate(timeseries.NanSum) } bySeverity[severity].Add(pattern.Messages) p := &Pattern{ - Severity: severity, + Severity: severity.String(), + Color: severity.Color(), Sample: pattern.Sample, Sum: uint64(sum), - Chart: model.NewChart(ctx, "").AddSeries(severity, pattern.Messages).Column().Legend(false), + Chart: model.NewChart(ctx, "").AddSeries(severity.String(), pattern.Messages, severity.Color()).Column().Legend(false), Hash: hash, } v.Patterns = append(v.Patterns, p) @@ -281,11 +274,21 @@ func renderPatterns(v *View, app *model.Application, ctx timeseries.Context) { if len(bySeverity) > 0 { v.Chart = model.NewChart(ctx, "").Column() for severity, ts := range bySeverity { - v.Chart.AddSeries(severity, ts.Get()) + v.Chart.AddSeries(severity.String(), ts.Get(), severity.Color()) } } } +func getServices(app *model.Application) []string { + res := utils.NewStringSet() + for _, i := range app.Instances { + for _, c := range i.Containers { + res.Add(model.ContainerIdToServiceName(c.Id)) + } + } + return res.Items() +} + func getSimilarHashes(app *model.Application, hash string) []string { res := utils.NewStringSet() for _, msgs := range app.LogMessages { diff --git a/api/views/overview/applications.go b/api/views/overview/applications.go index 61ce29885..03a6a856d 100644 --- a/api/views/overview/applications.go +++ b/api/views/overview/applications.go @@ -9,7 +9,6 @@ import ( "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" "github.com/dustin/go-humanize/english" - "golang.org/x/exp/maps" ) type ApplicationStatus struct { @@ -301,26 +300,10 @@ func formatPercent(v float32) string { } func getApplicationType(app *model.Application) *ApplicationType { - types := maps.Keys(app.ApplicationTypes()) - if len(types) == 0 { + t := app.ApplicationType() + if t == model.ApplicationTypeUnknown { return nil } - - var t model.ApplicationType - if len(types) == 1 { - t = types[0] - } else { - sort.Slice(types, func(i, j int) bool { - ti, tj := types[i], types[j] - tiw, tjw := ti.Weight(), tj.Weight() - if tiw == tjw { - return ti < tj - } - return tiw < tjw - }) - t = types[0] - } - report := t.AuditReport() hasReport := false var status model.Status diff --git a/api/views/overview/costs.go b/api/views/overview/costs.go index a32d0860f..3d4a17698 100644 --- a/api/views/overview/costs.go +++ b/api/views/overview/costs.go @@ -14,8 +14,9 @@ const ( ) type Costs struct { - Nodes []*NodeCosts `json:"nodes"` - Applications []*ApplicationCosts `json:"applications"` + Nodes []*NodeCosts `json:"nodes"` + Applications []*ApplicationCosts `json:"applications"` + CustomPricing bool `json:"custom_pricing"` } type NodeCosts struct { @@ -97,6 +98,9 @@ func renderCosts(w *model.World) *Costs { if n.Price == nil { continue } + if n.Price.Custom { + res.CustomPricing = true + } if dataTransferPrice == nil && n.DataTransferPrice != nil { dataTransferPrice = n.DataTransferPrice } diff --git a/api/views/overview/logs.go b/api/views/overview/logs.go new file mode 100644 index 000000000..ce6b79f65 --- /dev/null +++ b/api/views/overview/logs.go @@ -0,0 +1,161 @@ +package overview + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/coroot/coroot/clickhouse" + "github.com/coroot/coroot/model" + "github.com/coroot/coroot/utils" + "k8s.io/klog" +) + +const ( + defaultLimit = 100 +) + +type Logs struct { + 
Message string `json:"message"` + Error string `json:"error"` + Chart *model.Chart `json:"chart"` + Entries []LogEntry `json:"entries"` + Suggest []string `json:"suggest"` +} + +type LogEntry struct { + Application string `json:"application"` + Timestamp int64 `json:"timestamp"` + Severity string `json:"severity"` + Color string `json:"color"` + Message string `json:"message"` + Attributes map[string]string `json:"attributes"` + TraceId string `json:"trace_id"` +} + +type LogsQuery struct { + View string `json:"view"` + Agent bool `json:"agent"` + Otel bool `json:"otel"` + Filters []clickhouse.LogFilter `json:"filters"` + Limit int `json:"limit"` + Suggest *string `json:"suggest,omitempty"` +} + +func renderLogs(ctx context.Context, ch *clickhouse.Client, w *model.World, query string) *Logs { + v := &Logs{} + + if ch == nil { + v.Message = "Clickhouse integration is not configured." + return v + } + + var q LogsQuery + if query != "" { + if err := json.Unmarshal([]byte(query), &q); err != nil { + klog.Warningln(err) + } + } + if !q.Agent && !q.Otel { + return v + } + if q.Limit <= 0 { + q.Limit = defaultLimit + } + + lq := clickhouse.LogQuery{ + Ctx: w.Ctx, + Filters: q.Filters, + Limit: q.Limit, + } + + if !q.Agent || !q.Otel { + if q.Agent { + lq.Source = model.LogSourceAgent + } + if q.Otel { + lq.Source = model.LogSourceOtel + } + } + + var histogram []model.LogHistogramBucket + var entries []*model.LogEntry + var err error + if q.Suggest != nil { + v.Suggest, err = ch.GetLogFilters(ctx, lq, *q.Suggest) + } else { + histogram, err = ch.GetLogsHistogram(ctx, lq) + if err == nil { + entries, err = ch.GetLogs(ctx, lq) + } + } + + if err != nil { + klog.Errorln(err) + v.Error = fmt.Sprintf("Clickhouse error: %s", err) + return v + } + + if len(histogram) > 0 { + v.Chart = model.NewChart(w.Ctx, "").Column().Sorted() + for _, b := range histogram { + v.Chart.AddSeries(b.Severity.String(), b.Timeseries, b.Severity.Color()) + } + } + + v.renderEntries(entries, w) + + return v +} + +func (v *Logs) renderEntries(entries []*model.LogEntry, w *model.World) { + if len(entries) == 0 { + return + } + + ss := utils.NewStringSet() + for _, e := range entries { + ss.Add(e.ServiceName) + } + services := ss.Items() + + apps := map[string]*model.Application{} + for _, app := range w.Applications { + for _, i := range app.Instances { + for _, c := range i.Containers { + apps[model.ContainerIdToServiceName(c.Id)] = app + } + } + if settings := app.Settings; settings != nil && settings.Logs != nil && settings.Logs.Service != "" { + apps[settings.Logs.Service] = app + } else if service := model.GuessService(services, app.Id); service != "" { + apps[service] = app + } + } + + for _, e := range entries { + entry := LogEntry{ + Application: e.ServiceName, + Timestamp: e.Timestamp.UnixMilli(), + Severity: e.Severity.String(), + Color: e.Severity.Color(), + Message: e.Body, + Attributes: map[string]string{}, + TraceId: e.TraceId, + } + if app := apps[e.ServiceName]; app != nil { + entry.Application = app.Id.String() + } + for name, value := range e.LogAttributes { + if name != "" && value != "" { + entry.Attributes[name] = value + } + } + for name, value := range e.ResourceAttributes { + if name != "" && value != "" { + entry.Attributes[name] = value + } + } + v.Entries = append(v.Entries, entry) + } +} diff --git a/api/views/overview/nodes.go b/api/views/overview/nodes.go index c18073769..797be1240 100644 --- a/api/views/overview/nodes.go +++ b/api/views/overview/nodes.go @@ -11,7 +11,7 @@ import ( ) func 
renderNodes(w *model.World) *model.Table { - nodes := model.NewTable("Node", "Status", "Availability zone", "IP", "CPU", "Memory", "Network") + nodes := model.NewTable("Node", "Status", "Availability zone", "IP", "CPU", "Memory", "GPU", "Network") unknown := model.NewTable(nodes.Header...) for _, n := range w.Nodes { name := n.GetName() @@ -85,6 +85,10 @@ func renderNodes(w *model.World) *model.Table { if *status.Status == model.UNKNOWN { table = unknown } + gpus := model.NewTableCell() + if len(n.GPUs) > 0 { + gpus.SetValue(strconv.Itoa(len(n.GPUs))) + } table.AddRow( node, status, @@ -92,6 +96,7 @@ func renderNodes(w *model.World) *model.Table { model.NewTableCell(ips.Items()...), cpuPercent, memoryPercent, + gpus, network, ) } diff --git a/api/views/overview/overview.go b/api/views/overview/overview.go index 6dfc05544..29ee46b7c 100644 --- a/api/views/overview/overview.go +++ b/api/views/overview/overview.go @@ -15,6 +15,7 @@ type Overview struct { Nodes *model.Table `json:"nodes"` Deployments []*Deployment `json:"deployments"` Traces *Traces `json:"traces"` + Logs *Logs `json:"logs"` Costs *Costs `json:"costs"` Risks []*Risk `json:"risks"` Categories []model.ApplicationCategory `json:"categories"` @@ -49,6 +50,8 @@ func Render(ctx context.Context, ch *clickhouse.Client, w *model.World, view, qu v.Deployments = renderDeployments(w) case "traces": v.Traces = renderTraces(ctx, ch, w, query) + case "logs": + v.Logs = renderLogs(ctx, ch, w, query) case "costs": v.Costs = renderCosts(w) case "risks": diff --git a/api/views/overview/risks.go b/api/views/overview/risks.go index 3fc529265..b510fb183 100644 --- a/api/views/overview/risks.go +++ b/api/views/overview/risks.go @@ -1,6 +1,7 @@ package overview import ( + "fmt" "sort" "github.com/coroot/coroot/model" @@ -17,6 +18,7 @@ type Risk struct { Type string `json:"type"` Dismissal *model.RiskDismissal `json:"dismissal,omitempty"` Exposure *Exposure `json:"exposure,omitempty"` + Availability *Availability `json:"availability,omitempty"` } type Exposure struct { @@ -26,7 +28,174 @@ type Exposure struct { LoadBalancerServices []string `json:"load_balancer_services"` } +type Availability struct { + Description string `json:"description"` +} + func renderRisks(w *model.World) []*Risk { + res := dbPortExposures(w) + res = append(res, availabilityRisks(w)...) 
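+	// Most severe risks first; ties are broken by application name to keep the order stable.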
+ + sort.Slice(res, func(i, j int) bool { + if res[i].Severity == res[j].Severity { + return res[i].ApplicationId.Name < res[j].ApplicationId.Name + } + return res[i].Severity > res[j].Severity + }) + return res +} + +func availabilityRisks(w *model.World) []*Risk { + var res []*Risk + zones := utils.NewStringSet() + seenOnDemandNodes := false + for _, n := range w.Nodes { + if az := n.AvailabilityZone.Value(); az != "" { + zones.Add(az) + } + if lc := n.InstanceLifeCycle.Value(); lc == "on-demand" { + seenOnDemandNodes = true + } + } + + for _, app := range w.Applications { + switch app.Id.Kind { + case model.ApplicationKindExternalService, model.ApplicationKindRds, model.ApplicationKindElasticacheCluster, + model.ApplicationKindJob, model.ApplicationKindCronJob: + continue + } + dismissals := map[model.RiskKey]*model.RiskDismissal{} + if app.Settings != nil { + for _, ro := range app.Settings.RiskOverrides { + dismissals[ro.Key] = ro.Dismissal + } + } + appZones := utils.NewStringSet() + appNodes := utils.NewStringSet() + instanceLifeCycles := utils.NewStringSet() + availableInstances := 0 + availableInstancesByAppType := map[model.ApplicationType]int{} + for _, i := range app.Instances { + if !i.IsObsolete() && i.IsUp() { + availableInstances++ + for t := range i.ApplicationTypes() { + availableInstancesByAppType[t]++ + } + if i.Node != nil { + if z := i.Node.AvailabilityZone.Value(); z != "" { + appZones.Add(z) + } + appNodes.Add(i.NodeName()) + lc := i.Node.InstanceLifeCycle.Value() + if lc == "preemptible" { + lc = "spot" + } + instanceLifeCycles.Add(lc) + } + } + } + switch { + case app.IsStandalone(): + case availableInstances == 1 && len(w.Nodes) > 1: + res = append(res, availabilityRisk( + app, + dismissals, + model.WARNING, + model.RiskTypeSingleInstanceApp, + "Single instance - not resilient to node failure", + )) + case appNodes.Len() == 1 && len(w.Nodes) > 1: + res = append(res, availabilityRisk( + app, + dismissals, + model.WARNING, + model.RiskTypeSingleNodeApp, + "All instances on one node - not resilient to node failure", + )) + case appZones.Len() == 1 && zones.Len() > 1: + res = append(res, availabilityRisk( + app, + dismissals, + model.WARNING, + model.RiskTypeSingleAzApp, + "All instances in one Availability Zone - failure causes downtime", + )) + case seenOnDemandNodes && instanceLifeCycles.Len() == 1 && instanceLifeCycles.Items()[0] == "spot": + res = append(res, availabilityRisk( + app, + dismissals, + model.WARNING, + model.RiskTypeSpotOnlyApp, + "All instances on Spot nodes - risk of sudden termination. 
Add On-Demand", + )) + } + appTypes := app.ApplicationTypes() + for _, t := range []model.ApplicationType{ + model.ApplicationTypeMysql, model.ApplicationTypePostgres, + model.ApplicationTypeRedis, model.ApplicationTypeDragonfly, model.ApplicationTypeKeyDB, model.ApplicationTypeValkey, + model.ApplicationTypeMongodb, + model.ApplicationTypeElasticsearch, model.ApplicationTypeOpensearch, + } { + if !appTypes[t] { + continue + } + replicated := false + for _, u := range app.Upstreams { + if u.RemoteApplication == app { + continue + } + if u.RemoteApplication.ApplicationTypes()[t] { + replicated = true + } + } + if !replicated { + for _, u := range app.Downstreams { + if u.Application == app { + continue + } + if u.Application.ApplicationTypes()[t] { + replicated = true + } + } + } + if availableInstancesByAppType[t] > 0 && availableInstancesByAppType[t] < 2 && !replicated { + res = append(res, availabilityRisk( + app, + dismissals, + model.CRITICAL, + model.RiskTypeUnreplicatedDatabase, + "%s isn’t replicated - data loss possible", + utils.Capitalize(string(t)), + )) + } + } + } + return res +} + +func availabilityRisk(app *model.Application, dismissals map[model.RiskKey]*model.RiskDismissal, status model.Status, typ model.RiskType, format string, args ...any) *Risk { + key := model.RiskKey{ + Category: model.RiskCategoryAvailability, + Type: typ, + } + dismissal := dismissals[key] + if dismissal != nil { + status = model.OK + } + return &Risk{ + Key: key, + ApplicationId: app.Id, + ApplicationCategory: app.Category, + ApplicationType: getApplicationType(app), + Severity: status, + Dismissal: dismissal, + Availability: &Availability{ + Description: fmt.Sprintf(format, args...), + }, + } +} + +func dbPortExposures(w *model.World) []*Risk { var res []*Risk nodePublicIPs := utils.StringSet{} @@ -111,8 +280,5 @@ func renderRisks(w *model.World) []*Risk { }) } } - sort.Slice(res, func(i, j int) bool { - return res[i].ApplicationId.Name < res[j].ApplicationId.Name - }) return res } diff --git a/api/views/overview/service_map.go b/api/views/overview/service_map.go index caec60154..9c148a575 100644 --- a/api/views/overview/service_map.go +++ b/api/views/overview/service_map.go @@ -12,6 +12,7 @@ type Application struct { Category model.ApplicationCategory `json:"category"` Labels model.Labels `json:"labels"` Status model.Status `json:"status"` + Icon string `json:"icon"` Indicators []model.Indicator `json:"indicators"` Upstreams []Link `json:"upstreams"` @@ -36,6 +37,7 @@ func renderServiceMap(w *model.World) []*Application { Category: a.Category, Labels: a.Labels(), Status: a.Status, + Icon: a.ApplicationType().Icon(), Indicators: model.CalcIndicators(a), Upstreams: []Link{}, Downstreams: []Link{}, diff --git a/api/views/tracing/tracing.go b/api/views/tracing/tracing.go index 18b5cd585..86883b370 100644 --- a/api/views/tracing/tracing.go +++ b/api/views/tracing/tracing.go @@ -345,7 +345,7 @@ func getAppClients(app *model.Application) map[string]*model.Application { if client == nil || client == app { continue } - if !app.Category.Monitoring() && client.Category.Monitoring() { + if app.Id.Kind != model.ApplicationKindExternalService && !app.Category.Monitoring() && client.Category.Monitoring() { continue } for _, i := range client.Instances { diff --git a/api/views/views.go b/api/views/views.go index b8f4a1fef..1bac42063 100644 --- a/api/views/views.go +++ b/api/views/views.go @@ -7,9 +7,9 @@ import ( "github.com/coroot/coroot/api/views/application" "github.com/coroot/coroot/api/views/applications" 
"github.com/coroot/coroot/api/views/aws" + "github.com/coroot/coroot/api/views/dashboards" "github.com/coroot/coroot/api/views/incident" "github.com/coroot/coroot/api/views/inspections" - "github.com/coroot/coroot/api/views/integrations" "github.com/coroot/coroot/api/views/logs" "github.com/coroot/coroot/api/views/overview" "github.com/coroot/coroot/api/views/profiling" @@ -51,18 +51,10 @@ func Inspections(checkConfigs model.CheckConfigs) *inspections.View { return inspections.Render(checkConfigs) } -func Categories(p *db.Project) *applications.CategoriesView { - return applications.RenderCategories(p) -} - func CustomApplications(p *db.Project) *applications.CustomApplicationsView { return applications.RenderCustomApplications(p) } -func Integrations(p *db.Project) *integrations.View { - return integrations.Render(p) -} - func AWS(w *model.World) *aws.View { return aws.Render(w) } @@ -78,3 +70,7 @@ func Users(us []*db.User, rs []rbac.Role) *users.Users { func User(u *db.User, projects map[db.ProjectId]string, viewonly bool) *users.User { return users.RenderUser(u, projects, viewonly) } + +var ( + Dashboards = &dashboards.Dashboards{} +) diff --git a/auditor/auditor.go b/auditor/auditor.go index de6bd0881..830d63972 100644 --- a/auditor/auditor.go +++ b/auditor/auditor.go @@ -55,6 +55,7 @@ func Audit(w *model.World, p *db.Project, generateDetailedReportFor *model.Appli stages.stage("cpu", func() { a.cpu(ncs) }) stages.stage("memory", func() { a.memory(ncs) }) stages.stage("storage", a.storage) + stages.stage("gpu", a.gpu) stages.stage("network", a.network) stages.stage("dns", a.dns) stages.stage("postgres", a.postgres) diff --git a/auditor/dns.go b/auditor/dns.go index 802325d00..61b42ae60 100644 --- a/auditor/dns.go +++ b/auditor/dns.go @@ -50,81 +50,69 @@ func (a *appAuditor) dns() { "DNS latency, seconds", nil, ) - hist := map[float32]*timeseries.Aggregate{} byType := map[string]*timeseries.Aggregate{} errors := map[string]*timeseries.Aggregate{} byDomain := map[string]*DNSStats{} seenDNSRequests := false - for _, instance := range a.app.Instances { - for _, container := range instance.Containers { - for r, byStatus := range container.DNSRequests { - for status, ts := range byStatus { - if !seenDNSRequests && ts.Reduce(timeseries.NanSum) > 0 { - seenDNSRequests = true - } - d := byDomain[r.Domain] - if d == nil { - d = &DNSStats{Domain: r.Domain} - byDomain[r.Domain] = d - } - v := ts.Reduce(timeseries.NanSum) - if timeseries.IsNaN(v) { - continue - } - total := uint64(math.Round(float64(v) * float64(a.w.Ctx.Step))) - var st *DnsTypeStats - switch r.Type { - case "TypeA": - st = &d.A - case "TypeAAAA": - - st = &d.AAAA - default: - st = &d.Other - } - st.Requests += total - switch status { - case "ok": - case "nxdomain": - st.NxDomain += total - default: - serverErrorsCheck.Inc(int64(total)) - st.ServFail += total - } - if requestsChart != nil { - t := byType[r.Type] - if t == nil { - t = timeseries.NewAggregate(timeseries.NanSum) - byType[r.Type] = t - } - t.Add(ts) - - if status != "ok" { - label := r.Type + ":" + status - e := errors[label] - if e == nil { - e = timeseries.NewAggregate(timeseries.NanSum) - errors[label] = e - } - e.Add(ts) - } - } - } + for r, byStatus := range a.app.DNSRequests { + for status, ts := range byStatus { + if !seenDNSRequests && ts.Reduce(timeseries.NanSum) > 0 { + seenDNSRequests = true } - for b, ts := range container.DNSRequestsHistogram { - v := hist[b] - if v == nil { - v = timeseries.NewAggregate(timeseries.NanSum) - hist[b] = v + d := 
byDomain[r.Domain] + if d == nil { + d = &DNSStats{Domain: r.Domain} + byDomain[r.Domain] = d + } + v := ts.Reduce(timeseries.NanSum) + if timeseries.IsNaN(v) { + continue + } + total := uint64(math.Round(float64(v) * float64(a.w.Ctx.Step))) + var st *DnsTypeStats + switch r.Type { + case "TypeA": + st = &d.A + case "TypeAAAA": + st = &d.AAAA + default: + st = &d.Other + } + st.Requests += total + switch status { + case "ok": + case "nxdomain": + st.NxDomain += total + default: + serverErrorsCheck.Inc(int64(total)) + st.ServFail += total + } + if requestsChart != nil { + t := byType[r.Type] + if t == nil { + t = timeseries.NewAggregate(timeseries.NanSum) + byType[r.Type] = t } - v.Add(ts) + t.Add(ts) + + if status != "ok" { + label := r.Type + ":" + status + e := errors[label] + if e == nil { + e = timeseries.NewAggregate(timeseries.NanSum) + errors[label] = e + } + e.Add(ts) + } + } } } - buckets := make([]model.HistogramBucket, 0, len(hist)) - for le, ts := range hist { + buckets := make([]model.HistogramBucket, 0, len(a.app.DNSRequestsHistogram)) + for le, ts := range a.app.DNSRequestsHistogram { buckets = append(buckets, model.HistogramBucket{Le: le, TimeSeries: ts.Get()}) } sort.Slice(buckets, func(i, j int) bool { diff --git a/auditor/gpu.go b/auditor/gpu.go new file mode 100644 index 000000000..9725342f2 --- /dev/null +++ b/auditor/gpu.go @@ -0,0 +1,115 @@ +package auditor + +import ( + "fmt" + + "github.com/coroot/coroot/model" + "github.com/coroot/coroot/timeseries" + "github.com/coroot/coroot/utils" + "github.com/dustin/go-humanize" +) + +type gpuInfo struct { + instances *utils.StringSet + node *model.Node + gpu *model.GPU +} + +func (a *appAuditor) gpu() { + report := a.addReport(model.AuditReportGPU) + + table := report.GetOrCreateTable("GPU", "Name", "vRAM", "Node", "Instances") + usageChart := report.GetOrCreateChart(fmt.Sprintf("GPU usage by %s, %%", a.app.Id.Name), nil) + memoryUsageChart := report.GetOrCreateChart(fmt.Sprintf("GPU memory usage by %s, %%", a.app.Id.Name), nil) + + relatedGPUs := map[string]*gpuInfo{} + + seenGPUs := false + for _, i := range a.app.Instances { + if i.IsObsolete() { + continue + } + total := timeseries.NewAggregate(timeseries.NanSum) + memory := timeseries.NewAggregate(timeseries.NanSum) + for uuid, u := range i.GPUUsage { + seenGPUs = true + total.Add(u.UsageAverage) + memory.Add(u.MemoryUsageAverage) + + if i.Node != nil && i.Node.GPUs != nil { + if gpu := i.Node.GPUs[uuid]; gpu != nil { + gi := relatedGPUs[uuid] + if gi == nil { + gi = &gpuInfo{ + instances: utils.NewStringSet(), + node: i.Node, + gpu: gpu, + } + relatedGPUs[uuid] = gi + } + gi.instances.Add(i.Name) + } + } + } + usageChart.AddSeries(i.Name, total) + memoryUsageChart.AddSeries(i.Name, memory) + } + for uuid, gi := range relatedGPUs { + mem := model.NewTableCell() + if last := gi.gpu.TotalMemory.Last(); last > 0 { + mem.SetValue(humanize.Bytes(uint64(last))) + } + + node := model.NewTableCell().SetStatus(gi.node.Status(), gi.node.GetName()) + node.Link = model.NewRouterLink(gi.node.GetName(), "overview"). + SetParam("view", "nodes"). + SetParam("id", gi.node.GetName()) + + table.AddRow( + model.NewTableCell(uuid), + model.NewTableCell(gi.gpu.Name.Value()), + mem, + node, + model.NewTableCell(gi.instances.Items()...), + ) + report. + GetOrCreateChartGroup("GPU utilization <selector>, %", nil). + GetOrCreateChart("average"). + AddSeries(uuid, gi.gpu.UsageAverage).Feature() + report. + GetOrCreateChartGroup("GPU utilization <selector>, %", nil). + GetOrCreateChart("peak").
+ AddSeries(uuid, gi.gpu.UsagePeak) + report. + GetOrCreateChartGroup("GPU Memory utilization <selector>, %", nil). + GetOrCreateChart("average"). + AddSeries(uuid, gi.gpu.MemoryUsageAverage).Feature() + report. + GetOrCreateChartGroup("GPU Memory utilization <selector>, %", nil). + GetOrCreateChart("peak"). + AddSeries(uuid, gi.gpu.MemoryUsagePeak).Feature() + + coreChart := report. + GetOrCreateChartGroup("GPU consumers <selector>, %", nil). + GetOrCreateChart(uuid).Stacked() + memChart := report. + GetOrCreateChartGroup("GPU memory consumers <selector>, %", nil). + GetOrCreateChart(uuid).Stacked() + for _, ci := range gi.gpu.Instances { + if u := ci.GPUUsage[uuid]; u != nil { + coreChart.AddSeries(ci.Name, u.UsageAverage) + memChart.AddSeries(ci.Name, u.MemoryUsageAverage) + } + } + report. + GetOrCreateChart("GPU temperature, ℃", nil). + AddSeries(uuid, gi.gpu.Temperature) + report. + GetOrCreateChart("GPU power, watts", nil). + AddSeries(uuid, gi.gpu.PowerWatts) + } + + if !seenGPUs { + a.delReport(model.AuditReportGPU) + } +} diff --git a/auditor/logs.go b/auditor/logs.go index 914a06e08..94b420558 100644 --- a/auditor/logs.go +++ b/auditor/logs.go @@ -12,8 +12,8 @@ func (a *appAuditor) logs() { report.AddWidget(&model.Widget{Logs: &model.Logs{ApplicationId: a.app.Id, Check: check}, Width: "100%"}) sum := timeseries.NewAggregate(timeseries.NanSum) - for level, msgs := range a.app.LogMessages { - if !level.IsError() { + for severity, msgs := range a.app.LogMessages { + if severity < model.SeverityError { continue } sum.Add(msgs.Messages) diff --git a/auditor/memory.go b/auditor/memory.go index c7a14f6d3..5d594402e 100644 --- a/auditor/memory.go +++ b/auditor/memory.go @@ -13,7 +13,7 @@ func (a *appAuditor) memory(ncs nodeConsumersByNode) { leakCheck := report.CreateCheck(model.Checks.MemoryLeakPercent) usageChart := report.GetOrCreateChartGroup( - "Memory usage (RSS) <selector>, bytes", + "Memory usage <selector>, bytes", model.NewDocLink("inspections", "memory", "memory-usage"), ) oomChart := report.GetOrCreateChart( @@ -35,6 +35,7 @@ func (a *appAuditor) memory(ncs nodeConsumersByNode) { oom := timeseries.NewAggregate(timeseries.NanSum) instanceRss := timeseries.NewAggregate(timeseries.NanSum) instanceRssForTrend := timeseries.NewAggregate(timeseries.NanSum) + instancePageCache := timeseries.NewAggregate(timeseries.NanSum) for _, c := range i.Containers { seenContainers = true if limitByContainer[c.Name] == nil { @@ -42,11 +43,12 @@ func (a *appAuditor) memory(ncs nodeConsumersByNode) { } limitByContainer[c.Name].Add(c.MemoryLimit) if usageChart != nil { - usageChart.GetOrCreateChart("container: "+c.Name).AddSeries(i.Name, c.MemoryRss) + usageChart.GetOrCreateChart("RSS container: "+c.Name).AddSeries(i.Name, c.MemoryRss) } oom.Add(c.OOMKills) instanceRssForTrend.Add(c.MemoryRssForTrend) instanceRss.Add(c.MemoryRss) + instancePageCache.Add(c.MemoryCache) } if a.app.PeriodicJob() { leakCheck.SetStatus(model.UNKNOWN, "not checked for periodic jobs") @@ -63,8 +65,9 @@ func (a *appAuditor) memory(ncs nodeConsumersByNode) { } } - if usageChart != nil && len(usageChart.Charts) > 1 { - usageChart.GetOrCreateChart("total").AddSeries(i.Name, instanceRss).Feature() + if usageChart != nil { + usageChart.GetOrCreateChart("RSS").AddSeries(i.Name, instanceRss).Feature() + usageChart.GetOrCreateChart("RSS + PageCache").AddSeries(i.Name, timeseries.Sum(instanceRss.Get(), instancePageCache.Get())) } oomTs := oom.Get() @@ -101,7 +104,7 @@ func (a *appAuditor) memory(ncs nodeConsumersByNode) { if usageChart != nil { for container, limit := range
limitByContainer { - usageChart.GetOrCreateChart("container: "+container).SetThreshold("limit", limit.Get()) + usageChart.GetOrCreateChart("RSS container: "+container).SetThreshold("limit", limit.Get()) } } diff --git a/auditor/network.go b/auditor/network.go index 6ad5dc050..873d7d365 100644 --- a/auditor/network.go +++ b/auditor/network.go @@ -56,7 +56,7 @@ func (a *appAuditor) network() { } if trafficChart != nil { trafficChart.GetOrCreateChart("inbound").Stacked().AddSeries("←"+u.RemoteApplication.Id.Name, u.BytesReceived) - trafficChart.GetOrCreateChart("outbound").Stacked().AddSeries("→"+legend, u.BytesSent) + trafficChart.GetOrCreateChart("outbound").Stacked().AddSeries("→"+u.RemoteApplication.Id.Name, u.BytesSent) } } } diff --git a/auditor/node.go b/auditor/node.go index a3b8e2a9c..7ebbb7b25 100644 --- a/auditor/node.go +++ b/auditor/node.go @@ -18,6 +18,7 @@ func AuditNode(w *model.World, node *model.Node) *model.AuditReport { report.Status = model.OK + report.AddWidget(&model.Widget{GroupHeader: "CPU", Width: "100%"}) cpuByModeChart( report.GetOrCreateChart("CPU usage, %", model.NewDocLink("inspections", "cpu", "node-cpu-usage")), node.CpuUsageByMode, @@ -34,6 +35,8 @@ func AuditNode(w *model.World, node *model.Node) *model.AuditReport { node.MemoryTotalBytes, timeseries.Sum(node.MemoryCachedBytes, node.MemoryFreeBytes), ) + report.AddWidget(&model.Widget{GroupHeader: "Memory", Width: "100%"}) + + report. GetOrCreateChart("Memory usage, bytes", nil). Stacked(). @@ -48,6 +51,8 @@ func AuditNode(w *model.World, node *model.Node) *model.AuditReport { SetThreshold("total", node.MemoryTotalBytes). AddMany(ncs.memory, 5, timeseries.Max) + report.AddWidget(&model.Widget{GroupHeader: "Network", Width: "100%"}) + for _, i := range node.NetInterfaces { report. GetOrCreateChartInGroup("Network bandwidth <selector>, bits/second", i.Name, nil). @@ -86,6 +91,9 @@ func AuditNode(w *model.World, node *model.Node) *model.AuditReport { vol.PVCs.Add(v.Name.Value()) } } + if len(volumes) > 0 { + report.AddWidget(&model.Widget{GroupHeader: "Disks", Width: "100%"}) + } disks := report.GetOrCreateTable("Device", "Mount points", "Used by", "Latency", "I/O Load", "Space") ioLatencyChart := report.GetOrCreateChartGroup("Average I/O latency <selector>, seconds", nil) ioLoadChart := report.GetOrCreateChartGroup("I/O load (total latency) <selector>, seconds/second", nil) @@ -149,5 +157,53 @@ func AuditNode(w *model.World, node *model.Node) *model.AuditReport { AddSeries("used", vol.UsedBytes). SetThreshold("total", vol.CapacityBytes) } + + if len(node.GPUs) > 0 { + report.AddWidget(&model.Widget{GroupHeader: "GPUs", Width: "100%"}) + } + + for _, gpu := range node.GPUs { + gpus := report.GetOrCreateTable("GPU UUID", "Name", "vRAM") + mem := model.NewTableCell() + if last := gpu.TotalMemory.Last(); last > 0 { + mem.SetValue(humanize.Bytes(uint64(last))) + } + gpus.AddRow(model.NewTableCell(gpu.UUID), model.NewTableCell(gpu.Name.Value()), mem) + report. + GetOrCreateChartGroup("GPU utilization <selector>, %", nil). + GetOrCreateChart("average"). + AddSeries(gpu.UUID, gpu.UsageAverage).Feature() + report. + GetOrCreateChartGroup("GPU utilization <selector>, %", nil). + GetOrCreateChart("peak"). + AddSeries(gpu.UUID, gpu.UsagePeak) + report. + GetOrCreateChartGroup("GPU Memory utilization <selector>, %", nil). + GetOrCreateChart("average"). + AddSeries(gpu.UUID, gpu.MemoryUsageAverage).Feature() + report. + GetOrCreateChartGroup("GPU Memory utilization <selector>, %", nil). + GetOrCreateChart("peak"). + AddSeries(gpu.UUID, gpu.MemoryUsagePeak).Feature() + coreChart := report.
+ GetOrCreateChartGroup("GPU consumers , %", nil). + GetOrCreateChart(gpu.UUID).Stacked() + memChart := report. + GetOrCreateChartGroup("GPU memory consumers , %", nil). + GetOrCreateChart(gpu.UUID).Stacked() + for _, ci := range gpu.Instances { + if u := ci.GPUUsage[gpu.UUID]; u != nil { + coreChart.AddSeries(ci.Name, u.UsageAverage) + memChart.AddSeries(ci.Name, u.MemoryUsageAverage) + } + } + report. + GetOrCreateChart("GPU temperature, ℃", nil). + AddSeries(gpu.UUID, gpu.Temperature) + report. + GetOrCreateChart("GPU power, watts", nil). + AddSeries(gpu.UUID, gpu.PowerWatts) + + } return report } diff --git a/cache/updater.go b/cache/updater.go index 4de9dd971..31c223a19 100644 --- a/cache/updater.go +++ b/cache/updater.go @@ -109,7 +109,7 @@ func (c *Cache) updaterWorker(projects *sync.Map, projectId db.ProjectId, promCl if availabilityCfg.Custom { queries = append(queries, constructor.Q("", availabilityCfg.Total()), constructor.Q("", availabilityCfg.Failed())) } - latencyCfg, _ := checkConfigs.GetLatency(appId, model.CalcApplicationCategory(appId, project.Settings.ApplicationCategories)) + latencyCfg, _ := checkConfigs.GetLatency(appId, project.CalcApplicationCategory(appId)) if latencyCfg.Custom { queries = append(queries, constructor.Q("", latencyCfg.Histogram(), "le")) } @@ -211,7 +211,7 @@ func (c *Cache) download(to timeseries.Time, promClient *prom.Client, projectId } for _, i := range calcIntervals(from, step, to, jitter) { ctx, cancel := context.WithTimeout(context.Background(), queryTimeout) - vs, err := promClient.QueryRange(ctx, task.query.Query, task.query.Labels, i.chunkTs, i.toTs, step) + vs, err := promClient.QueryRange(ctx, task.query.Query, task.query.Labels.Has, i.chunkTs, i.toTs, step) cancel() if err != nil { klog.Errorln("failed to query prometheus:", err) @@ -326,7 +326,7 @@ func (c *Cache) processRecordingRules(to timeseries.Time, project *db.Project, s finalized := chunkEnd == i.toTs for name, rule := range constructor.RecordingRules { hash := queryHash(name) - mvs := rule(project, world) + mvs := rule(c.db, project, world) err = c.writeChunk(project.Id, hash, i.chunkTs, pointsCount, step, finalized, mvs) if err != nil { klog.Errorln("failed to save chunk:", err) @@ -385,7 +385,7 @@ func getScrapeInterval(promClient *prom.Client) (timeseries.Duration, error) { to := timeseries.Now() from := to.Add(-timeseries.Hour) query := fmt.Sprintf("timestamp(node_info)-%d", from) - mvs, err := promClient.QueryRange(ctx, query, nil, from, to, step) + mvs, err := promClient.QueryRange(ctx, query, prom.FilterLabelsDropAll, from, to, step) if err != nil { return step, err } diff --git a/clickhouse/clickhouse.go b/clickhouse/clickhouse.go index 95e8c635a..74bdf7643 100644 --- a/clickhouse/clickhouse.go +++ b/clickhouse/clickhouse.go @@ -85,3 +85,8 @@ func (c *Client) Query(ctx context.Context, query string, args ...interface{}) ( query = collector.ReplaceTables(query, c.useDistributedTables) return c.conn.Query(ctx, query, args...) } + +func (c *Client) QueryRow(ctx context.Context, query string, args ...interface{}) driver.Row { + query = collector.ReplaceTables(query, c.useDistributedTables) + return c.conn.QueryRow(ctx, query, args...) 
+} diff --git a/clickhouse/logs.go b/clickhouse/logs.go index 7e8ea6317..4d2f2f3b9 100644 --- a/clickhouse/logs.go +++ b/clickhouse/logs.go @@ -12,178 +12,281 @@ import ( "github.com/coroot/coroot/model" "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" - "golang.org/x/exp/maps" ) -func (c *Client) GetServicesFromLogs(ctx context.Context, from timeseries.Time) (map[string][]string, error) { - rows, err := c.Query(ctx, "SELECT DISTINCT ServiceName, SeverityText FROM @@table_otel_logs_service_name_severity_text@@ WHERE LastSeen >= @from", +func (c *Client) GetServicesFromLogs(ctx context.Context, from timeseries.Time) ([]string, error) { + rows, err := c.Query(ctx, "SELECT DISTINCT ServiceName FROM @@table_otel_logs_service_name_severity_text@@ WHERE LastSeen >= @from", clickhouse.DateNamed("from", from.ToStandard(), clickhouse.NanoSeconds), ) if err != nil { return nil, err } defer rows.Close() - res := map[string][]string{} - var app, sev string + var res []string + var app string for rows.Next() { - if err = rows.Scan(&app, &sev); err != nil { + if err = rows.Scan(&app); err != nil { return nil, err } - res[app] = append(res[app], sev) + res = append(res, app) } return res, nil } -func (c *Client) GetServiceLogsHistogram(ctx context.Context, from, to timeseries.Time, step timeseries.Duration, service string, severities []string, search string) (map[string]*timeseries.TimeSeries, error) { - filters, args := logFilters(from, to, []string{service}, severities, nil, search) - return c.getLogsHistogram(ctx, filters, args, from, to, step) -} - -func (c *Client) GetServiceLogs(ctx context.Context, from, to timeseries.Time, service string, severities []string, search string, limit int) ([]*model.LogEntry, error) { - filters, args := logFilters(from, to, []string{service}, severities, nil, search) - return c.getLogs(ctx, filters, args, severities, limit) -} - -func (c *Client) GetContainerLogsHistogram(ctx context.Context, from, to timeseries.Time, step timeseries.Duration, containers map[string][]string, severities []string, hashes []string, search string) (map[string]*timeseries.TimeSeries, error) { - services := maps.Keys(containers) - filters, args := logFilters(from, to, services, severities, hashes, search) - return c.getLogsHistogram(ctx, filters, args, from, to, step) -} - -func (c *Client) GetContainerLogs(ctx context.Context, from, to timeseries.Time, containers map[string][]string, severities []string, hashes []string, search string, limit int) ([]*model.LogEntry, error) { - byService := map[string][]*model.LogEntry{} - for service, ids := range containers { - filters, args := logFilters(from, to, []string{service}, nil, hashes, search) - filters = append(filters, "ResourceAttributes['container.id'] IN (@containerId)") - args = append(args, clickhouse.Named("containerId", ids)) - entries, err := c.getLogs(ctx, filters, args, severities, limit) - if err != nil { +func (c *Client) GetLogsHistogram(ctx context.Context, query LogQuery) ([]model.LogHistogramBucket, error) { + where, args := query.filters(nil) + q := fmt.Sprintf("SELECT multiIf(SeverityNumber=0, 0, intDiv(SeverityNumber, 4)+1), toStartOfInterval(Timestamp, INTERVAL %d second), count(1)", query.Ctx.Step) + q += " FROM @@table_otel_logs@@" + q += " WHERE " + strings.Join(where, " AND ") + q += " GROUP BY 1, 2" + rows, err := c.Query(ctx, q, args...) 
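+	// Each row is (severity bucket, interval start, count); the multiIf collapses OTel SeverityNumbers
+	// (assigned by the spec in blocks of four: trace, debug, info, warn, error, fatal) into model.Severity
+	// levels, keeping SeverityNumber=0 as its own unspecified bucket.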
+ if err != nil { + return nil, err + } + defer rows.Close() + bySeverity := map[int64]*timeseries.TimeSeries{} + var sev int64 + var t time.Time + var count uint64 + for rows.Next() { + if err = rows.Scan(&sev, &t, &count); err != nil { return nil, err } - if len(containers) == 1 { - return entries, nil + if bySeverity[sev] == nil { + bySeverity[sev] = timeseries.New(query.Ctx.From, query.Ctx.PointsCount(), query.Ctx.Step) } - byService[service] = entries - } - var res []*model.LogEntry - for _, entries := range byService { - res = append(res, entries...) + bySeverity[sev].Set(timeseries.Time(t.Unix()), float32(count)) } - sort.Slice(res, func(i, j int) bool { - return res[i].Timestamp.After(res[j].Timestamp) - }) - if len(res) > limit { - return res[:limit], nil + res := make([]model.LogHistogramBucket, 0, len(bySeverity)) + for s, ts := range bySeverity { + res = append(res, model.LogHistogramBucket{Severity: model.Severity(s), Timeseries: ts}) } + sort.Slice(res, func(i, j int) bool { return res[i].Severity < res[j].Severity }) return res, nil } -func (c *Client) getLogsHistogram(ctx context.Context, filters []string, args []any, from, to timeseries.Time, step timeseries.Duration) (map[string]*timeseries.TimeSeries, error) { - q := fmt.Sprintf("SELECT SeverityText, toStartOfInterval(Timestamp, INTERVAL %d second), count(1)", step) +func (c *Client) GetLogs(ctx context.Context, query LogQuery) ([]*model.LogEntry, error) { + where, args := query.filters(nil) + q := "SELECT ServiceName, Timestamp, multiIf(SeverityNumber=0, 0, intDiv(SeverityNumber, 4)+1), Body, TraceId, ResourceAttributes, LogAttributes" q += " FROM @@table_otel_logs@@" - q += " WHERE " + strings.Join(filters, " AND ") - q += " GROUP BY 1, 2" + q += " WHERE " + strings.Join(where, " AND ") + q += " LIMIT " + fmt.Sprint(query.Limit) + rows, err := c.Query(ctx, q, args...) 
if err != nil { return nil, err } defer rows.Close() - res := map[string]*timeseries.TimeSeries{} - var sev string - var ts time.Time - var count uint64 + var res []*model.LogEntry for rows.Next() { - if err = rows.Scan(&sev, &ts, &count); err != nil { + var e model.LogEntry + var sev int64 + if err = rows.Scan(&e.ServiceName, &e.Timestamp, &sev, &e.Body, &e.TraceId, &e.ResourceAttributes, &e.LogAttributes); err != nil { return nil, err } - if res[sev] == nil { - res[sev] = timeseries.New(from, int(to.Sub(from)/step), step) - } - res[sev].Set(timeseries.Time(ts.Unix()), float32(count)) + e.Severity = model.Severity(sev) + res = append(res, &e) } return res, nil } -func (c *Client) getLogs(ctx context.Context, filters []string, args []any, severities []string, limit int) ([]*model.LogEntry, error) { - if len(severities) == 0 { - return nil, nil - } - - var qs []string - for _, severity := range severities { - q := "SELECT Timestamp, SeverityText, Body, TraceId, ResourceAttributes, LogAttributes" - q += " FROM @@table_otel_logs@@" - q += " WHERE " + strings.Join(append(filters, fmt.Sprintf("SeverityText = '%s'", severity)), " AND ") - q += " ORDER BY toUnixTimestamp(Timestamp) DESC LIMIT " + fmt.Sprint(limit) - qs = append(qs, q) +func (c *Client) GetLogFilters(ctx context.Context, query LogQuery, name string) ([]string, error) { + where, args := query.filters(&name) + var q string + var res []string + switch name { + case "": + res = append(res, "Severity", "Message") + q = "SELECT DISTINCT arrayJoin(arrayConcat(mapKeys(LogAttributes), mapKeys(ResourceAttributes)))" + case "Severity": + q = "SELECT DISTINCT multiIf(SeverityNumber=0, 0, intDiv(SeverityNumber, 4)+1)" + case "Message": + return res, nil + default: + q = "SELECT DISTINCT arrayJoin([LogAttributes[@attr], ResourceAttributes[@attr]])" + args = append(args, clickhouse.Named("attr", name)) } - q := "SELECT *" - q += " FROM (" + strings.Join(qs, " UNION ALL ") + ") l" - q += " ORDER BY Timestamp DESC LIMIT " + fmt.Sprint(limit) - + q += " FROM @@table_otel_logs@@" + q += " WHERE " + strings.Join(where, " AND ") + q += " ORDER BY 1 LIMIT 1000" rows, err := c.Query(ctx, q, args...) 
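+	// Depending on the requested filter name, each returned row holds an attribute key, a severity bucket, or an attribute value.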
if err != nil { return nil, err } defer rows.Close() - var res []*model.LogEntry + var s string + var i int64 for rows.Next() { - var e model.LogEntry - if err = rows.Scan(&e.Timestamp, &e.Severity, &e.Body, &e.TraceId, &e.ResourceAttributes, &e.LogAttributes); err != nil { + switch name { + case "Severity": + err = rows.Scan(&i) + s = model.Severity(i).String() + default: + err = rows.Scan(&s) + } + if err != nil { return nil, err } - res = append(res, &e) + if s == "" { + continue + } + res = append(res, s) } return res, nil } -func logFilters(from, to timeseries.Time, services []string, severities []string, hashes []string, search string) ([]string, []any) { - var filters []string - var args []any +type LogQuery struct { + Ctx timeseries.Context + Source model.LogSource + Services []string + Filters []LogFilter + Limit int +} - if len(services) == 1 { - filters = append(filters, "ServiceName = @serviceName") - args = append(args, clickhouse.Named("serviceName", services[0])) - } else { - filters = append(filters, "ServiceName IN (@serviceName)") - args = append(args, clickhouse.Named("serviceName", services)) - } +type LogFilter struct { + Name string `json:"name"` + Op string `json:"op"` + Value string `json:"value"` +} + +func (q LogQuery) filters(attr *string) ([]string, []any) { + var where []string + var args []any - if len(severities) > 0 { - filters = append(filters, "SeverityText IN (@severityText)") - args = append(args, clickhouse.Named("severityText", severities)) + switch len(q.Services) { + case 0: + switch q.Source { + case model.LogSourceAgent: + where = append(where, "startsWith(ServiceName, '/')") + case model.LogSourceOtel: + where = append(where, "NOT startsWith(ServiceName, '/')") + } + case 1: + where = append(where, "ServiceName = @serviceName") + args = append(args, clickhouse.Named("serviceName", q.Services[0])) + default: + where = append(where, "ServiceName IN (@serviceName)") + args = append(args, clickhouse.Named("serviceName", q.Services)) } - filters = append(filters, "Timestamp BETWEEN @from AND @to") + where = append(where, "Timestamp BETWEEN @from AND @to") args = append(args, - clickhouse.DateNamed("from", from.ToStandard(), clickhouse.NanoSeconds), - clickhouse.DateNamed("to", to.ToStandard(), clickhouse.NanoSeconds), + clickhouse.DateNamed("from", q.Ctx.From.ToStandard(), clickhouse.NanoSeconds), + clickhouse.DateNamed("to", q.Ctx.To.ToStandard(), clickhouse.NanoSeconds), ) - if len(hashes) > 0 { - filters = append(filters, "LogAttributes['pattern.hash'] IN (@patternHash)") - args = append(args, - clickhouse.Named("patternHash", hashes), - ) + filters := utils.Uniq(q.Filters) + var message []string + byName := map[string][]LogFilter{} + for _, f := range filters { + if attr == nil && f.Name == "Message" { + fields := strings.FieldsFunc(f.Value, func(r rune) bool { + return unicode.IsSpace(r) || (r <= unicode.MaxASCII && !unicode.IsNumber(r) && !unicode.IsLetter(r)) + }) + message = append(message, fields...) 
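+			// Message filter values are split into word tokens here; they are matched against Body with hasToken() below.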
+ continue + } + if attr != nil && f.Name == *attr { + continue + } + byName[f.Name] = append(byName[f.Name], f) } - if len(search) > 0 { - fields := strings.FieldsFunc(search, func(r rune) bool { - return unicode.IsSpace(r) || (r <= unicode.MaxASCII && !unicode.IsNumber(r) && !unicode.IsLetter(r)) - }) - if len(fields) > 0 { - var ands []string - for i, f := range fields { - set := utils.NewStringSet(f, strings.ToLower(f), strings.ToUpper(f), strings.Title(f)) - var ors []string - for j, s := range set.Items() { - name := fmt.Sprintf("token_%d_%d", i, j) - ors = append(ors, fmt.Sprintf("hasToken(Body, @%s)", name)) - args = append(args, clickhouse.Named(name, s)) + + i := 0 + for name, attrs := range byName { + var ors, ands []string + switch name { + case "Severity": + for j, a := range attrs { + r1, r2 := model.SeverityFromString(a.Value).Range() + var f *[]string + var expr string + switch a.Op { + case "=": + expr = "SeverityNumber BETWEEN @%[1]s AND @%[2]s" + f = &ors + case "!=": + expr = "SeverityNumber NOT BETWEEN @%[1]s AND @%[2]s" + f = &ands + default: + continue + } + v1 := fmt.Sprintf("severity_from_%d", j) + v2 := fmt.Sprintf("severity_to_%d", j) + *f = append(*f, fmt.Sprintf(expr, v1, v2)) + args = append(args, clickhouse.Named(v1, r1)) + args = append(args, clickhouse.Named(v2, r2)) + } + case "TraceId": + for j, a := range attrs { + var f *[]string + var expr string + switch a.Op { + case "=": + expr = "TraceId = @%[1]s" + f = &ors + default: + continue } + v := fmt.Sprintf("trace_id_%d", j) + *f = append(*f, fmt.Sprintf(expr, v)) + args = append(args, clickhouse.Named(v, a.Value)) + } + default: + for j, a := range attrs { + var f *[]string + var expr string + switch a.Op { + case "=": + expr = "(LogAttributes[@%[1]s] = @%[2]s OR ResourceAttributes[@%[1]s] = @%[2]s)" + f = &ors + case "!=": + expr = "(LogAttributes[@%[1]s] != @%[2]s AND ResourceAttributes[@%[1]s] != @%[2]s)" + f = &ands + case "~": + expr = "(match(LogAttributes[@%[1]s], @%[2]s) OR match(ResourceAttributes[@%[1]s], @%[2]s))" + f = &ors + case "!~": + expr = "(NOT match(LogAttributes[@%[1]s], @%[2]s) AND NOT match(ResourceAttributes[@%[1]s], @%[2]s))" + f = &ands + default: + continue + } + n := fmt.Sprintf("attr_name_%d_%d", i, j) + v := fmt.Sprintf("attr_values_%d_%d", i, j) + *f = append(*f, fmt.Sprintf(expr, n, v)) + args = append(args, clickhouse.Named(n, name)) + args = append(args, clickhouse.Named(v, a.Value)) + } + } + if len(ands) > 0 { + where = append(where, "("+strings.Join(ands, " AND ")+")") + } + if len(ors) > 0 { + where = append(where, "("+strings.Join(ors, " OR ")+")") + } + i++ + } + + if len(message) > 0 { + message = utils.Uniq(message) + var ands []string + for i, m := range message { + set := utils.NewStringSet(m, strings.ToLower(m), strings.ToUpper(m), strings.Title(m)) + var ors []string + for j, s := range set.Items() { + name := fmt.Sprintf("token_%d_%d", i, j) + ors = append(ors, fmt.Sprintf("hasToken(Body, @%s)", name)) + args = append(args, clickhouse.Named(name, s)) + } + if len(ors) > 0 { ands = append(ands, fmt.Sprintf("(%s)", strings.Join(ors, " OR "))) } - filters = append(filters, strings.Join(ands, " AND ")) + } + if len(ands) > 0 { + where = append(where, strings.Join(ands, " AND ")) } } - return filters, args + + return where, args } diff --git a/clickhouse/traces.go b/clickhouse/traces.go index a3e0f7473..bd6caa3fa 100644 --- a/clickhouse/traces.go +++ b/clickhouse/traces.go @@ -54,7 +54,7 @@ func (c *Client) GetRootSpansHistogram(ctx context.Context, q SpanQuery) 
([]mode func (c *Client) GetRootSpans(ctx context.Context, q SpanQuery) ([]*model.TraceSpan, error) { filter, filterArgs := q.RootSpansFilter() - return c.getSpans(ctx, q, "", "Timestamp DESC", filter, filterArgs) + return c.getSpans(ctx, q, "", "", filter, filterArgs) } func (c *Client) GetRootSpansSummary(ctx context.Context, q SpanQuery) (*model.TraceSpanSummary, error) { @@ -102,7 +102,7 @@ func (c *Client) GetSpansByServiceNameHistogram(ctx context.Context, q SpanQuery func (c *Client) GetSpansByServiceName(ctx context.Context, q SpanQuery) ([]*model.TraceSpan, error) { filter, filterArgs := q.SpansByServiceNameFilter() - return c.getSpans(ctx, q, "", "Timestamp DESC", filter, filterArgs) + return c.getSpans(ctx, q, "", "", filter, filterArgs) } func (c *Client) GetInboundSpansHistogram(ctx context.Context, q SpanQuery, clients []string, listens []model.Listen) ([]model.HistogramBucket, error) { @@ -118,7 +118,7 @@ func (c *Client) GetInboundSpans(ctx context.Context, q SpanQuery, clients []str return nil, nil } filter, filterArgs := inboundSpansFilter(clients, listens) - return c.getSpans(ctx, q, "", "Timestamp DESC", filter, filterArgs) + return c.getSpans(ctx, q, "", "", filter, filterArgs) } func (c *Client) GetParentSpans(ctx context.Context, spans []*model.TraceSpan) ([]*model.TraceSpan, error) { @@ -375,14 +375,26 @@ func (c *Client) getSpans(ctx context.Context, q SpanQuery, with string, orderBy func (c *Client) getTraces(ctx context.Context, filters []string, filterArgs []any) ([]*model.Trace, error) { query := fmt.Sprintf(` -WITH ( - SELECT min(Timestamp) AS start, max(Timestamp)+1 AS end, groupArray(distinct TraceId) AS ids - FROM (SELECT TraceId, Timestamp FROM @@table_otel_traces@@ WHERE %s ORDER BY Timestamp LIMIT 1000) -) AS t +SELECT min(Timestamp) AS start, max(Timestamp)+1 AS end, groupArray(distinct TraceId) AS ids +FROM (SELECT TraceId, Timestamp FROM @@table_otel_traces@@ WHERE %s ORDER BY Timestamp LIMIT 1000)`, + strings.Join(filters, " AND ")) + var minTs, maxTs time.Time + var traceIds []string + if err := c.QueryRow(ctx, query, filterArgs...).Scan(&minTs, &maxTs, &traceIds); err != nil { + return nil, err + } + if len(traceIds) == 0 { + return nil, nil + } + query = ` SELECT Timestamp, TraceId, SpanId, ParentSpanId, SpanName, ServiceName, Duration, StatusCode, StatusMessage, ResourceAttributes, SpanAttributes, Events.Timestamp, Events.Name, Events.Attributes FROM @@table_otel_traces@@ -WHERE Timestamp BETWEEN t.start AND t.end AND has(coalesce(t.ids, []), TraceId)`, strings.Join(filters, " AND ")) - rows, err := c.Query(ctx, query, filterArgs...) 
+WHERE Timestamp BETWEEN @from AND @to AND TraceId IN @traceIds` + rows, err := c.Query(ctx, query, + clickhouse.DateNamed("from", minTs, clickhouse.NanoSeconds), + clickhouse.DateNamed("to", maxTs, clickhouse.NanoSeconds), + clickhouse.Named("traceIds", traceIds), + ) if err != nil { return nil, err } diff --git a/cloud-pricing/manager.go b/cloud-pricing/manager.go index c275c3be3..d35108ecf 100644 --- a/cloud-pricing/manager.go +++ b/cloud-pricing/manager.go @@ -14,6 +14,7 @@ import ( "sync" "time" + "github.com/coroot/coroot/db" "github.com/coroot/coroot/model" "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" @@ -25,6 +26,7 @@ const ( dumpFileName = "cloud-pricing.json.gz" dumpTimeout = time.Second * 30 updateInterval = time.Hour * 24 + gb = 1e9 ) type Manager struct { @@ -61,7 +63,7 @@ func NewManager(dataDir string) (*Manager, error) { return m, nil } -func (mgr *Manager) GetNodePrice(node *model.Node) *model.NodePrice { +func (mgr *Manager) GetNodePrice(settings *db.CustomCloudPricing, node *model.Node) *model.NodePrice { mgr.lock.Lock() defer mgr.lock.Unlock() if mgr.model == nil { @@ -69,6 +71,11 @@ func (mgr *Manager) GetNodePrice(node *model.Node) *model.NodePrice { } var pricing *CloudPricing var price float32 + cpuCores := node.CpuCapacity.Reduce(timeseries.Max) + memBytes := node.MemoryTotalBytes.Reduce(timeseries.Max) + if timeseries.IsNaN(cpuCores) || timeseries.IsNaN(memBytes) { + return nil + } switch strings.ToLower(node.CloudProvider.Value()) { case "aws": pricing = mgr.model.AWS @@ -77,6 +84,14 @@ func (mgr *Manager) GetNodePrice(node *model.Node) *model.NodePrice { case "azure": pricing = mgr.model.Azure default: + if settings != nil { + return &model.NodePrice{ + Total: cpuCores*settings.PerCPUCore/float32(timeseries.Hour) + memBytes*settings.PerMemoryGb/gb/float32(timeseries.Hour), + PerCPUCore: settings.PerCPUCore / float32(timeseries.Hour), + PerMemoryByte: settings.PerMemoryGb / gb / float32(timeseries.Hour), + Custom: true, + } + } return nil } region := Region(strings.ToLower(node.Region.Value())) @@ -136,13 +151,10 @@ func (mgr *Manager) GetNodePrice(node *model.Node) *model.NodePrice { return nil } price /= float32(timeseries.Hour) - cpuCores := node.CpuCapacity.Last() - memBytes := node.MemoryTotalBytes.Last() np := &model.NodePrice{Total: price} if timeseries.IsNaN(cpuCores) || timeseries.IsNaN(memBytes) { return np } - const gb = 1e9 perUnit := price / (cpuCores + memBytes/gb) // assume that 1Gb of memory costs the same as 1 vCPU np.PerCPUCore = perUnit np.PerMemoryByte = perUnit / gb diff --git a/collector/collector.go b/collector/collector.go index fb78c9a26..cd5919a68 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "strings" "sync" "time" @@ -103,15 +104,9 @@ func (c *Collector) getProject(apiKey string) (*db.Project, error) { c.projectsLock.RLock() defer c.projectsLock.RUnlock() - if apiKey == "" { - if len(c.projects) == 1 { - return maps.Values(c.projects)[0], nil - } - for _, p := range c.projects { - if p.Name == "default" { - return p, nil - } - } + isEmptyKey := apiKey == "" + if isEmptyKey { + apiKey = strings.Repeat("0", 32) } for _, p := range c.projects { @@ -121,7 +116,29 @@ func (c *Collector) getProject(apiKey string) (*db.Project, error) { } } } - + if isEmptyKey { + var project *db.Project + if len(c.projects) == 1 { + project = maps.Values(c.projects)[0] + } else { + for _, p := range c.projects { + if p.Name == "default" { + project = p + break + 
} + } + } + if project != nil { + project.Settings.ApiKeys = append(project.Settings.ApiKeys, db.ApiKey{ + Key: apiKey, + Description: "Default project access (no API key required)", + }) + if err := c.db.SaveProjectSettings(project); err != nil { + return nil, err + } + return project, nil + } + } return nil, ErrProjectNotFound } diff --git a/collector/metrics.go b/collector/metrics.go index fcc4f08b7..69fa3c17f 100644 --- a/collector/metrics.go +++ b/collector/metrics.go @@ -1,6 +1,7 @@ package collector import ( + "bufio" "bytes" "crypto/tls" "errors" @@ -118,7 +119,6 @@ func (c *Collector) Metrics(w http.ResponseWriter, r *http.Request) { req.Header.Add(k, v) } } - httpClient := secureClient if cfg.TlsSkipVerify { httpClient = insecureClient @@ -129,15 +129,27 @@ func (c *Collector) Metrics(w http.ResponseWriter, r *http.Request) { http.Error(w, "", http.StatusInternalServerError) return } + defer func() { + io.Copy(io.Discard, res.Body) + res.Body.Close() + }() for k, vs := range res.Header { for _, v := range vs { w.Header().Add(k, v) } } - if res.StatusCode >= 400 { + if res.StatusCode == http.StatusBadRequest { + scanner := bufio.NewScanner(io.LimitReader(res.Body, 1024)) + line := "" + if scanner.Scan() { + line = scanner.Text() + } + klog.Errorf("failed to write: got %d (%s) from prometheus, responding to the agent with 200 (to prevent retry)", res.StatusCode, line) + w.WriteHeader(http.StatusOK) + return + } else if res.StatusCode > 400 { klog.Errorf("failed to write: got %d from prometheus", res.StatusCode) } w.WriteHeader(res.StatusCode) - _, _ = io.Copy(w, r.Body) - _ = res.Body.Close() + _, _ = io.Copy(w, res.Body) } diff --git a/config/bootstrap.go b/config/bootstrap.go index a5661f2e7..2ecf966b6 100644 --- a/config/bootstrap.go +++ b/config/bootstrap.go @@ -2,6 +2,7 @@ package config import ( "github.com/coroot/coroot/db" + "github.com/coroot/coroot/model" "k8s.io/klog" ) @@ -38,7 +39,8 @@ func (cfg *Config) Bootstrap(database *db.DB) error { byName := map[string]*db.Project{} for _, p := range ps { byName[p.Name] = p - p.Settings.Configurable = true + p.Settings.Readonly = false + p.Settings.Integrations.NotificationIntegrations.Readonly = false } for _, p := range cfg.Projects { pp := byName[p.Name] @@ -51,8 +53,24 @@ func (cfg *Config) Bootstrap(database *db.DB) error { } byName[pp.Name] = pp } + pp.Settings.Readonly = true pp.Settings.ApiKeys = p.ApiKeys - pp.Settings.Configurable = false + if p.NotificationIntegrations != nil { + pp.Settings.Integrations.NotificationIntegrations = *p.NotificationIntegrations + } + pp.Settings.Integrations.NotificationIntegrations.Readonly = p.NotificationIntegrations != nil + if len(p.ApplicationCategories) > 0 { + pp.Settings.ApplicationCategorySettings = map[model.ApplicationCategory]*db.ApplicationCategorySettings{} + for _, c := range p.ApplicationCategories { + pp.Settings.ApplicationCategorySettings[c.Name] = &c.ApplicationCategorySettings + } + } + if len(p.CustomApplications) > 0 { + pp.Settings.CustomApplications = map[string]model.CustomApplication{} + for _, c := range p.CustomApplications { + pp.Settings.CustomApplications[c.Name] = c.CustomApplication + } + } } for _, p := range byName { if p.Settings.ApiKeys == nil { diff --git a/config/config.go b/config/config.go index 445a94db3..1a657de75 100644 --- a/config/config.go +++ b/config/config.go @@ -12,14 +12,12 @@ import ( "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" "gopkg.in/yaml.v3" - "k8s.io/klog" ) type Config struct { ListenAddress string 
`yaml:"listen_address"` UrlBasePath string `yaml:"url_base_path"` DataDir string `yaml:"data_dir"` - LicenseKey string `yaml:"license_key"` Cache Cache `yaml:"cache"` Traces Traces `yaml:"traces"` @@ -143,13 +141,8 @@ type Auth struct { BootstrapAdminPassword string `yaml:"bootstrap_admin_password"` } -type Project struct { - Name string `yaml:"name"` - ApiKeys []db.ApiKey `yaml:"api_keys"` -} - -func Load() *Config { - cfg := &Config{ +func NewConfig() *Config { + return &Config{ ListenAddress: ":8080", UrlBasePath: "/", DataDir: "./data", @@ -173,31 +166,46 @@ func Load() *Config { BootstrapAdminPassword: db.AdminUserDefaultPassword, }, } - err := cfg.load() +} + +func Load() (*Config, error) { + cfg := NewConfig() + data, err := ReadFromFile() if err != nil { - klog.Exitln(err) + return nil, err } - return cfg -} -func (cfg *Config) load() error { - if *configFile != "" { - f, err := os.Open(*configFile) - if err != nil { - return err - } - defer f.Close() - data, err := io.ReadAll(f) - if err != nil { - return err - } + if len(data) > 0 { if err = yaml.Unmarshal(data, cfg); err != nil { - return err + return nil, err } } - cfg.applyFlags() + cfg.ApplyFlags() + + if err = cfg.Validate(); err != nil { + return nil, err + } + return cfg, nil +} + +func ReadFromFile() ([]byte, error) { + if *configFile == "" { + return nil, nil + } + f, err := os.Open(*configFile) + if err != nil { + return nil, err + } + defer f.Close() + data, err := io.ReadAll(f) + if err != nil { + return nil, err + } + return data, nil +} +func (cfg *Config) Validate() error { var err error cfg.UrlBasePath, err = url.JoinPath("/", cfg.UrlBasePath, "/") if err != nil { @@ -205,16 +213,8 @@ func (cfg *Config) load() error { } for i, p := range cfg.Projects { - if p.Name == "" { - return fmt.Errorf("invalid project #%d: name is required", i) - } - if len(p.ApiKeys) == 0 { - return fmt.Errorf("invalid project '%s': no api_keys defined", p.Name) - } - for ik, k := range p.ApiKeys { - if k.Key == "" { - return fmt.Errorf("invalid api_key #%d for project '%s': key is required", ik, p.Name) - } + if err = p.Validate(); err != nil { + return fmt.Errorf("invalid project #%d: %w", i, err) } } diff --git a/config/flags.go b/config/flags.go index c19ec900a..79555a410 100644 --- a/config/flags.go +++ b/config/flags.go @@ -24,7 +24,6 @@ var ( authAnonymousRole = kingpin.Flag("auth-anonymous-role", "Disable authentication and assign one of the following roles to the anonymous user: Admin, Editor, or Viewer.").Envar("AUTH_ANONYMOUS_ROLE").String() authBootstrapAdminPassword = kingpin.Flag("auth-bootstrap-admin-password", "Password for the default Admin user").Envar("AUTH_BOOTSTRAP_ADMIN_PASSWORD").String() developerMode = kingpin.Flag("developer-mode", "If enabled, Coroot will not use embedded static assets").Envar("DEVELOPER_MODE").Bool() - licenseKey = kingpin.Flag("license-key", "License key for Coroot Enterprise Edition.").Envar("LICENSE_KEY").String() globalClickhouseAddress = kingpin.Flag("global-clickhouse-address", "").Envar("GLOBAL_CLICKHOUSE_ADDRESS").String() globalClickhouseUser = kingpin.Flag("global-clickhouse-user", "").Envar("GLOBAL_CLICKHOUSE_USER").String() @@ -52,7 +51,7 @@ var ( bootstrapClickhouseDatabase = kingpin.Flag("bootstrap-clickhouse-database", "").Envar("BOOTSTRAP_CLICKHOUSE_DATABASE").String() ) -func (cfg *Config) applyFlags() { +func (cfg *Config) ApplyFlags() { if *listen != "" { cfg.ListenAddress = *listen } @@ -101,9 +100,6 @@ func (cfg *Config) applyFlags() { if *developerMode { cfg.DeveloperMode = 
*developerMode } - if *licenseKey != "" { - cfg.LicenseKey = *licenseKey - } keep := cfg.GlobalClickhouse != nil || *globalClickhouseAddress != "" if cfg.GlobalClickhouse == nil { diff --git a/config/project.go b/config/project.go new file mode 100644 index 000000000..5503a18fe --- /dev/null +++ b/config/project.go @@ -0,0 +1,81 @@ +package config + +import ( + "fmt" + + "github.com/coroot/coroot/db" + "github.com/coroot/coroot/model" +) + +type Project struct { + Name string `yaml:"name"` + + ApiKeys []db.ApiKey `yaml:"apiKeys"` + ApiKeysSnake []db.ApiKey `yaml:"api_keys"` // TODO: remove + + NotificationIntegrations *db.NotificationIntegrations `yaml:"notificationIntegrations"` + ApplicationCategories []ApplicationCategory `yaml:"applicationCategories"` + CustomApplications []CustomApplication `yaml:"customApplications"` +} + +func (p *Project) Validate() error { + if p.Name == "" { + return fmt.Errorf("name is required") + } + + if len(p.ApiKeys) == 0 { + p.ApiKeys = p.ApiKeysSnake + } + if len(p.ApiKeys) == 0 { + return fmt.Errorf("no api keys defined") + } + for i, k := range p.ApiKeys { + if err := k.Validate(); err != nil { + return fmt.Errorf("invalid api key #%d: %w", i, err) + } + } + + if p.NotificationIntegrations != nil { + if err := p.NotificationIntegrations.Validate(); err != nil { + return fmt.Errorf("invalid notification integrations: %w", err) + } + } + + for i, c := range p.ApplicationCategories { + if err := c.Validate(); err != nil { + return fmt.Errorf("invalid application category #%d: %w", i, err) + } + } + + for i, c := range p.CustomApplications { + if err := c.Validate(); err != nil { + return fmt.Errorf("invalid custom application #%d: %w", i, err) + } + } + + return nil +} + +type ApplicationCategory struct { + Name model.ApplicationCategory `yaml:"name"` + db.ApplicationCategorySettings `yaml:",inline"` +} + +func (c *ApplicationCategory) Validate() error { + if c.Name == "" { + return fmt.Errorf("name is required") + } + return nil +} + +type CustomApplication struct { + Name string `yaml:"name"` + model.CustomApplication `yaml:",inline"` +} + +func (c *CustomApplication) Validate() error { + if c.Name == "" { + return fmt.Errorf("name is required") + } + return nil +} diff --git a/constructor/connections.go b/constructor/connections.go index c080fac48..f931db561 100644 --- a/constructor/connections.go +++ b/constructor/connections.go @@ -1,16 +1,18 @@ package constructor import ( + "net" "strings" "github.com/coroot/coroot/model" "github.com/coroot/coroot/timeseries" + "github.com/coroot/coroot/utils" "k8s.io/klog" ) -func (c *Constructor) loadAppToAppConnections(w *model.World, metrics map[string][]*model.MetricValues) { +func (c *Constructor) loadAppToAppConnections(w *model.World, metrics map[string][]*model.MetricValues, fqdn2ip map[string]*utils.StringSet) { for queryName := range metrics { - if !strings.HasPrefix(queryName, "rr_connection") { + if !strings.HasPrefix(queryName, "rr_connection") || strings.HasSuffix(queryName, "_raw") { continue } for _, mv := range metrics[queryName] { @@ -28,6 +30,16 @@ func (c *Constructor) loadAppToAppConnections(w *model.World, metrics map[string conn := app.Upstreams[destId] if conn == nil { dest := w.GetOrCreateApplication(destId, false) + if destId.Kind == model.ApplicationKindExternalService { + if fqdn, port, _ := net.SplitHostPort(destId.Name); fqdn != "" && port != "" { + if ips := fqdn2ip[fqdn]; ips != nil { + for _, ip := range ips.Items() { + instance := dest.GetOrCreateInstance(ip, nil) + 
instance.TcpListens[model.Listen{IP: ip, Port: port}] = true + } + } + } + } conn = &model.AppToAppConnection{ Application: app, RemoteApplication: dest, diff --git a/constructor/constructor.go b/constructor/constructor.go index 3961502e8..6078c9548 100644 --- a/constructor/constructor.go +++ b/constructor/constructor.go @@ -13,7 +13,6 @@ import ( "github.com/coroot/coroot/model" "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" - "golang.org/x/exp/maps" "k8s.io/klog" ) @@ -57,7 +56,12 @@ func (c *Constructor) LoadWorld(ctx context.Context, from, to timeseries.Time, s } w := model.NewWorld(from, to, step, rawStep) w.CustomApplications = c.project.Settings.CustomApplications - w.Categories = maps.Keys(c.project.Settings.ApplicationCategories) + for name := range c.project.Settings.ApplicationCategorySettings { + if !name.Default() { + w.Categories = append(w.Categories, name) + } + } + utils.SortSlice(w.Categories) if prof == nil { prof = &Profile{} @@ -84,12 +88,13 @@ func (c *Constructor) LoadWorld(ctx context.Context, from, to timeseries.Time, s ecInstancesById := map[string]*model.Instance{} servicesByClusterIP := map[string]*model.Service{} ip2fqdn := map[string]*utils.StringSet{} + fqdn2ip := map[string]*utils.StringSet{} containers := containerCache{} // order is important prof.stage("load_job_statuses", func() { loadPromJobStatuses(metrics, pjs) }) prof.stage("load_nodes", func() { c.loadNodes(w, metrics, nodes) }) - prof.stage("load_fqdn", func() { loadFQDNs(metrics, ip2fqdn) }) + prof.stage("load_fqdn", func() { loadFQDNs(metrics, ip2fqdn, fqdn2ip) }) prof.stage("load_fargate_nodes", func() { c.loadFargateNodes(metrics, nodes) }) prof.stage("load_k8s_metadata", func() { loadKubernetesMetadata(w, metrics, servicesByClusterIP) }) prof.stage("load_aws_status", func() { loadAWSStatus(w, metrics) }) @@ -99,14 +104,15 @@ func (c *Constructor) LoadWorld(ctx context.Context, from, to timeseries.Time, s prof.stage("load_elasticache", func() { c.loadElasticache(w, metrics, pjs, ecInstancesById) }) prof.stage("load_fargate_containers", func() { loadFargateContainers(w, metrics, pjs) }) prof.stage("load_containers", func() { c.loadContainers(w, metrics, pjs, nodes, containers, servicesByClusterIP, ip2fqdn) }) - prof.stage("load_app_to_app_connections", func() { c.loadAppToAppConnections(w, metrics) }) + prof.stage("load_app_to_app_connections", func() { c.loadAppToAppConnections(w, metrics, fqdn2ip) }) prof.stage("load_application_traffic", func() { c.loadApplicationTraffic(w, metrics) }) prof.stage("load_jvm", func() { c.loadJVM(metrics, containers) }) prof.stage("load_dotnet", func() { c.loadDotNet(metrics, containers) }) prof.stage("load_python", func() { c.loadPython(metrics, containers) }) prof.stage("enrich_instances", func() { enrichInstances(w, metrics, rdsInstancesById, ecInstancesById) }) - prof.stage("join_db_cluster", func() { joinDBClusterComponents(w) }) prof.stage("calc_app_categories", func() { c.calcApplicationCategories(w) }) + prof.stage("group_custom_applications", func() { c.groupCustomApplications(w) }) + prof.stage("join_db_cluster_components", func() { c.joinDBClusterComponents(w) }) prof.stage("load_app_settings", func() { c.loadApplicationSettings(w) }) prof.stage("load_app_sli", func() { c.loadSLIs(w, metrics) }) prof.stage("load_container_logs", func() { c.loadContainerLogs(metrics, containers, pjs) }) @@ -189,7 +195,7 @@ func (c *Constructor) queryCache(ctx context.Context, from, to timeseries.Time, addQuery(qName+"total_requests", 
qApplicationCustomSLI, availabilityCfg.Total(), true) addQuery(qName+"failed_requests", qApplicationCustomSLI, availabilityCfg.Failed(), true) } - latencyCfg, _ := checkConfigs.GetLatency(appId, model.CalcApplicationCategory(appId, c.project.Settings.ApplicationCategories)) + latencyCfg, _ := checkConfigs.GetLatency(appId, c.project.CalcApplicationCategory(appId)) if latencyCfg.Custom { addQuery(qName+"requests_histogram", qApplicationCustomSLI, latencyCfg.Histogram(), true) } @@ -234,7 +240,11 @@ func (c *Constructor) queryCache(ctx context.Context, from, to timeseries.Time, func (c *Constructor) calcApplicationCategories(w *model.World) { for _, app := range w.Applications { - app.Category = model.CalcApplicationCategory(app.Id, c.project.Settings.ApplicationCategories) + if annotation := app.GetAnnotation(model.ApplicationAnnotationCategory); annotation != "" { + app.Category = model.ApplicationCategory(annotation) + continue + } + app.Category = c.project.CalcApplicationCategory(app.Id) } } @@ -385,52 +395,80 @@ func enrichInstances(w *model.World, metrics map[string][]*model.MetricValues, r } } -func joinDBClusterComponents(w *model.World) { - clusters := map[model.ApplicationId]*model.Application{} - toDelete := map[model.ApplicationId]*model.Application{} - for _, app := range w.Applications { - for _, instance := range app.Instances { - if instance.ClusterName.Value() == "" { - continue - } - id := model.NewApplicationId(app.Id.Namespace, model.ApplicationKindDatabaseCluster, instance.ClusterName.Value()) - cluster := clusters[id] - if cluster == nil { - cluster = w.GetOrCreateApplication(id, false) - clusters[id] = cluster - } - toDelete[app.Id] = cluster - } - } - if len(toDelete) > 0 { - for id, app := range w.Applications { - cluster := toDelete[app.Id] - if cluster == nil { - continue - } +type appGroup struct { + app *model.Application + members map[model.ApplicationId]*model.Application +} + +func (c *Constructor) groupApplications(w *model.World, groups map[model.ApplicationId]*appGroup) { + for _, group := range groups { + categories := utils.NewStringSet() + for _, app := range group.members { for _, svc := range app.KubernetesServices { found := false - for _, existingSvc := range cluster.KubernetesServices { + for _, existingSvc := range group.app.KubernetesServices { if svc.Name == existingSvc.Name && svc.Namespace == existingSvc.Namespace { found = true break } } if !found { - cluster.KubernetesServices = append(cluster.KubernetesServices, svc) + group.app.KubernetesServices = append(group.app.KubernetesServices, svc) } - svc.DestinationApps[cluster.Id] = cluster - delete(svc.DestinationApps, id) + svc.DestinationApps[group.app.Id] = group.app + delete(svc.DestinationApps, app.Id) } - cluster.DesiredInstances = merge(cluster.DesiredInstances, app.DesiredInstances, timeseries.NanSum) + group.app.DesiredInstances = merge(group.app.DesiredInstances, app.DesiredInstances, timeseries.NanSum) for _, instance := range app.Instances { - instance.Owner = cluster + instance.Owner = group.app instance.ClusterComponent = app + group.app.Instances = append(group.app.Instances, instance) + } + categories.Add(string(app.Category)) + delete(w.Applications, app.Id) + } + group.app.Category = model.ApplicationCategory(categories.GetFirst()) + if group.app.Category == "" { + group.app.Category = c.project.CalcApplicationCategory(group.app.Id) + } + } +} + +func (c *Constructor) groupCustomApplications(w *model.World) { + customApps := map[model.ApplicationId]*appGroup{} + for _, app := 
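Category resolution now has a fixed precedence: an explicit annotation wins over the project's configured rules, and a grouped application inherits the first member category found. A compact sketch of that precedence; the annotation key shown is assumed, since only the `model.ApplicationAnnotationCategory` constant appears in this diff:

```go
package main

import "fmt"

// calcFromProjectRules stands in for project.CalcApplicationCategory,
// which is not shown in this diff; the annotation key is assumed.
func category(annotations map[string]string, calcFromProjectRules func() string) string {
	if c := annotations["coroot.com/application-category"]; c != "" {
		return c // an explicit annotation wins
	}
	return calcFromProjectRules()
}

func main() {
	a := map[string]string{"coroot.com/application-category": "databases"}
	fmt.Println(category(a, func() string { return "application" }))   // databases
	fmt.Println(category(nil, func() string { return "application" })) // application
}
```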
range w.Applications { + customName := app.GetAnnotation(model.ApplicationAnnotationCustomName) + if customName == "" { + continue + } + id := model.NewApplicationId(app.Id.Namespace, model.ApplicationKindCustomApplication, customName) + group := customApps[id] + if group == nil { + group = &appGroup{app: w.GetOrCreateApplication(id, true), members: map[model.ApplicationId]*model.Application{}} + customApps[id] = group + } + group.members[app.Id] = app + } + c.groupApplications(w, customApps) +} + +func (c *Constructor) joinDBClusterComponents(w *model.World) { + dbClusters := map[model.ApplicationId]*appGroup{} + for _, app := range w.Applications { + for _, instance := range app.Instances { + if instance.ClusterName.Value() == "" { + continue + } + id := model.NewApplicationId(app.Id.Namespace, model.ApplicationKindDatabaseCluster, instance.ClusterName.Value()) + cluster := dbClusters[id] + if cluster == nil { + cluster = &appGroup{app: w.GetOrCreateApplication(id, false), members: map[model.ApplicationId]*model.Application{}} + dbClusters[id] = cluster } - cluster.Instances = append(cluster.Instances, app.Instances...) - delete(w.Applications, id) + cluster.members[app.Id] = app } } + c.groupApplications(w, dbClusters) } func guessPod(ls model.Labels) string { diff --git a/constructor/containers.go b/constructor/containers.go index dc0e7ec88..5a4a71fe4 100644 --- a/constructor/containers.go +++ b/constructor/containers.go @@ -9,6 +9,7 @@ import ( "github.com/coroot/coroot/model" "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" + "inet.af/netaddr" "k8s.io/klog" ) @@ -18,6 +19,34 @@ type instanceId struct { node model.NodeId } +type nsName struct { + ns string + name string +} + +func (c *Constructor) getInstanceByAppId(apps map[nsName]*model.Application, appId string) (*model.Instance, *model.Container) { + if !strings.HasPrefix(appId, "/") { + klog.Warningln("invalid app id:", appId) + return nil, nil + } + parts := strings.Split(appId, "/") + if len(parts) != 4 { + klog.Warningln("invalid app id:", appId) + return nil, nil + } + switch parts[1] { + case "k8s", "k8s-cronjob": + default: + klog.Warningln("unknown app:", appId) + return nil, nil + } + app := apps[nsName{parts[2], parts[3]}] + if app == nil || len(app.Instances) == 0 { + return nil, nil + } + return app.Instances[0], nil +} + func (c *Constructor) getInstanceAndContainer(w *model.World, node *model.Node, instances map[instanceId]*model.Instance, containerId string) (*model.Instance, *model.Container) { var nodeId model.NodeId var nodeName string @@ -100,31 +129,47 @@ type containerCache map[model.NodeContainerId]struct { func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model.MetricValues, pjs promJobStatuses, nodes nodeCache, containers containerCache, servicesByClusterIP map[string]*model.Service, ip2fqdn map[string]*utils.StringSet) { instances := map[instanceId]*model.Instance{} + apps := map[nsName]*model.Application{} + rttByInstance := map[instanceId]map[string]*timeseries.TimeSeries{} + instancesByListen := map[model.Listen]*model.Instance{} + for _, a := range w.Applications { + if a.Id.Namespace != "" { + apps[nsName{ns: a.Id.Namespace, name: a.Id.Name}] = a + } for _, i := range a.Instances { var nodeId model.NodeId if i.Node != nil { nodeId = i.Node.Id } instances[instanceId{ns: a.Id.Namespace, name: i.Name, node: nodeId}] = i + for l := range i.TcpListens { + instancesByListen[l] = i // POD_IP:0 + } } } - rttByInstance := 
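`getInstanceByAppId` accepts agent-reported app ids of the form `/k8s/<namespace>/<name>` (or `/k8s-cronjob/...`) and resolves them to the first instance of the matching application (the returned container is nil). The parsing on its own:

```go
package main

import (
	"fmt"
	"strings"
)

// parseAppID mirrors the validation in getInstanceByAppId: it accepts
// "/k8s/<namespace>/<name>" and "/k8s-cronjob/<namespace>/<name>".
func parseAppID(appID string) (ns, name string, ok bool) {
	if !strings.HasPrefix(appID, "/") {
		return "", "", false
	}
	parts := strings.Split(appID, "/") // ["", "k8s", ns, name]
	if len(parts) != 4 {
		return "", "", false
	}
	switch parts[1] {
	case "k8s", "k8s-cronjob":
		return parts[2], parts[3], true
	}
	return "", "", false
}

func main() {
	fmt.Println(parseAppID("/k8s/default/frontend")) // default frontend true
	fmt.Println(parseAppID("not-an-app-id"))         // "" "" false
}
```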
map[instanceId]map[string]*timeseries.TimeSeries{} - loadContainer := func(queryName string, f func(instance *model.Instance, container *model.Container, metric *model.MetricValues)) { ms := metrics[queryName] for _, m := range ms { - v, ok := containers[m.NodeContainerId] - if !ok { - nodeId := model.NewNodeIdFromLabels(m) - v.instance, v.container = c.getInstanceAndContainer(w, nodes[nodeId], instances, m.ContainerId) - containers[m.NodeContainerId] = v - } - if v.instance == nil || v.container == nil { - continue + appId := m.Labels["app_id"] + if appId != "" { + instance, container := c.getInstanceByAppId(apps, appId) + if instance != nil { + f(instance, container, m) + } + } else { + v, ok := containers[m.NodeContainerId] + if !ok { + nodeId := model.NewNodeIdFromLabels(m) + v.instance, v.container = c.getInstanceAndContainer(w, nodes[nodeId], instances, m.ContainerId) + containers[m.NodeContainerId] = v + } + if v.instance == nil || v.container == nil { + continue + } + f(v.instance, v.container, m) } - f(v.instance, v.container, m) } } @@ -170,17 +215,6 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model loadContainer("container_restarts", func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { container.Restarts = merge(container.Restarts, timeseries.Increase(metric.Values, pjs.get(metric.Labels)), timeseries.Any) }) - loadContainer(qRecordingRuleInstanceL7Requests, func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { - if metric.Labels["failed"] != "" { - instance.Requests.Failed = merge(instance.Requests.Failed, metric.Values, timeseries.Any) - } else { - instance.Requests.Ok = merge(instance.Requests.Ok, metric.Values, timeseries.Any) - } - }) - loadContainer(qRecordingRuleInstanceL7Latency, func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { - instance.Requests.TotalLatency = merge(instance.Requests.TotalLatency, metric.Values, timeseries.Any) - }) - loadContainer("container_net_latency", func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { id := instanceId{ns: instance.Owner.Id.Namespace, name: instance.Name, node: instance.NodeId()} rtts := rttByInstance[id] @@ -191,50 +225,128 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model rttByInstance[id] = rtts }) loadContainer("container_net_tcp_listen_info", func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { - ip, port, err := net.SplitHostPort(metric.Labels["listen_addr"]) + ipStr, port, err := net.SplitHostPort(metric.Labels["listen_addr"]) if err != nil { klog.Warningf("failed to split %s to ip:port pair: %s", metric.Labels["listen_addr"], err) return } isActive := metric.Values.Last() == 1 - l := model.Listen{IP: ip, Port: port, Proxied: metric.Labels["proxy"] != ""} + l := model.Listen{IP: ipStr, Port: port, Proxied: metric.Labels["proxy"] != ""} if !instance.TcpListens[l] { instance.TcpListens[l] = isActive } + if ip := net.ParseIP(ipStr); ip.IsLoopback() { + if instance.Node != nil { + l.IP = instance.NodeName() + instancesByListen[l] = instance + } + } else { + instancesByListen[l] = instance + } }) - loadConnection := func(queryName string, f func(connection *model.Connection, metric *model.MetricValues)) { + getInstanceByDest := func(m *model.MetricValues) *model.Instance { + remoteIP, remotePort, err := net.SplitHostPort(m.ActualDestination) + if err != nil { + return nil + } + l 
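`instancesByListen` is now filled in two passes: every already-known listen (including the `POD_IP:0` wildcard entries) maps back to its instance up front, and `container_net_tcp_listen_info` adds the rest, re-keying loopback listens by node name since `127.0.0.1` only identifies a host-local peer. A minimal sketch of the re-keying rule:

```go
package main

import (
	"fmt"
	"net"
)

type listen struct {
	IP, Port string
	Proxied  bool
}

func main() {
	byListen := map[listen]string{}
	nodeName := "node-a"

	register := func(instance, ipStr, port string) {
		l := listen{IP: ipStr, Port: port}
		if ip := net.ParseIP(ipStr); ip.IsLoopback() {
			// only reachable from the same host: key by node name instead
			l.IP = nodeName
		}
		byListen[l] = instance
	}
	register("mysql-0", "127.0.0.1", "3306")
	register("mysql-0", "10.42.0.9", "3306")
	for l, inst := range byListen {
		fmt.Printf("%s:%s -> %s\n", l.IP, l.Port, inst)
	}
}
```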
:= model.Listen{IP: remoteIP, Port: remotePort, Proxied: true} + if ip := net.ParseIP(remoteIP); ip.IsLoopback() { + return nil + } + if instance := instancesByListen[l]; instance != nil { + return instance + } + l.Proxied = false + if instance := instancesByListen[l]; instance != nil { + return instance + } + l.Port = "0" + return instancesByListen[l] + } + + loadConnection := func(queryName string, f func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues)) { loadContainer(queryName, func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { - conn := getOrCreateConnection(instance, container.Name, metric) + conn := getOrCreateConnection(instance, metric) if conn != nil { - f(conn, metric) + f(instance, conn, metric) } }) } - loadConnection("container_net_tcp_successful_connects", func(connection *model.Connection, metric *model.MetricValues) { - connection.SuccessfulConnections = merge(connection.SuccessfulConnections, metric.Values, timeseries.Any) + loadConnection("container_net_tcp_successful_connects", func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.SuccessfulConnections = merge(connection.SuccessfulConnections, metric.Values, timeseries.NanSum) }) - loadConnection("container_net_tcp_connection_time_seconds", func(connection *model.Connection, metric *model.MetricValues) { - connection.ConnectionTime = merge(connection.ConnectionTime, metric.Values, timeseries.Any) + loadConnection("container_net_tcp_connection_time_seconds", func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.ConnectionTime = merge(connection.ConnectionTime, metric.Values, timeseries.NanSum) }) - loadConnection("container_net_tcp_bytes_sent", func(connection *model.Connection, metric *model.MetricValues) { - connection.BytesSent = merge(connection.BytesSent, metric.Values, timeseries.Any) + + regionAz := func(instance *model.Instance, metric *model.MetricValues) (string, string, string, string, *model.Instance) { + srcAZ := metric.Labels["az"] + srcRegion := metric.Labels["region"] + var dstRegion, dstAZ string + var destInstance *model.Instance + + if srcAZ == "" && instance.Node != nil { + srcAZ = instance.Node.AvailabilityZone.Value() + srcRegion = instance.Node.Region.Value() + } + if destInstance = getInstanceByDest(metric); destInstance != nil && destInstance.Node != nil { + dstRegion = destInstance.Node.Region.Value() + dstAZ = destInstance.Node.AvailabilityZone.Value() + } + return srcRegion, srcAZ, dstRegion, dstAZ, destInstance + } + + loadConnection("container_net_tcp_bytes_sent", func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.BytesSent = merge(connection.BytesSent, metric.Values, timeseries.NanSum) + + srcRegion, srcAZ, dstRegion, dstAZ, destInstance := regionAz(instance, metric) + if dstRegion != "" && dstRegion == srcRegion && srcAZ != "" && dstAZ != "" { + if srcAZ == dstAZ { + return + } + instance.Owner.TrafficStats.CrossAZEgress = merge(instance.Owner.TrafficStats.CrossAZEgress, metric.Values, timeseries.NanSum) + destInstance.Owner.TrafficStats.CrossAZIngress = merge(destInstance.Owner.TrafficStats.CrossAZIngress, metric.Values, timeseries.NanSum) + return + } + dstIp, _, err := net.SplitHostPort(metric.Destination) + if err != nil { + return + } + ip, err := netaddr.ParseIP(dstIp) + switch { + case err != nil: //fqdn + case utils.IsIpExternal(ip): + default: + return + } + 
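`getInstanceByDest` tries progressively looser keys: the proxied listen, then the plain listen, then the port-`"0"` wildcard; loopback destinations are rejected outright because they cannot identify a remote host. The fallback order, isolated:

```go
package main

import "fmt"

type listen struct {
	IP, Port string
	Proxied  bool
}

// lookup mirrors the fallback order in getInstanceByDest.
func lookup(byListen map[listen]string, ip, port string) (string, bool) {
	for _, l := range []listen{
		{IP: ip, Port: port, Proxied: true}, // 1. proxied listen
		{IP: ip, Port: port},                // 2. plain listen
		{IP: ip, Port: "0"},                 // 3. POD_IP:0 wildcard
	} {
		if inst, ok := byListen[l]; ok {
			return inst, true
		}
	}
	return "", false
}

func main() {
	byListen := map[listen]string{{IP: "10.42.0.9", Port: "0"}: "mysql-0"}
	fmt.Println(lookup(byListen, "10.42.0.9", "3306")) // mysql-0 true
}
```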
instance.Owner.TrafficStats.InternetEgress = merge(instance.Owner.TrafficStats.InternetEgress, metric.Values, timeseries.NanSum) }) - loadConnection("container_net_tcp_bytes_received", func(connection *model.Connection, metric *model.MetricValues) { - connection.BytesReceived = merge(connection.BytesReceived, metric.Values, timeseries.Any) + + loadConnection("container_net_tcp_bytes_received", func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.BytesReceived = merge(connection.BytesReceived, metric.Values, timeseries.NanSum) + srcRegion, srcAZ, dstRegion, dstAZ, destInstance := regionAz(instance, metric) + if dstRegion != "" && dstRegion == srcRegion && srcAZ != "" && dstAZ != "" { + if srcAZ == dstAZ { + return + } + instance.Owner.TrafficStats.CrossAZIngress = merge(instance.Owner.TrafficStats.CrossAZIngress, metric.Values, timeseries.NanSum) + destInstance.Owner.TrafficStats.CrossAZEgress = merge(destInstance.Owner.TrafficStats.CrossAZEgress, metric.Values, timeseries.NanSum) + return + } }) - loadConnection("container_net_tcp_failed_connects", func(connection *model.Connection, metric *model.MetricValues) { - connection.FailedConnections = merge(connection.FailedConnections, metric.Values, timeseries.Any) + loadConnection("container_net_tcp_failed_connects", func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.FailedConnections = merge(connection.FailedConnections, metric.Values, timeseries.NanSum) }) - loadConnection("container_net_tcp_active_connections", func(connection *model.Connection, metric *model.MetricValues) { - connection.Active = merge(connection.Active, metric.Values, timeseries.Any) + loadConnection("container_net_tcp_active_connections", func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.Active = merge(connection.Active, metric.Values, timeseries.NanSum) }) - loadConnection("container_net_tcp_retransmits", func(connection *model.Connection, metric *model.MetricValues) { - connection.Retransmissions = merge(connection.Retransmissions, metric.Values, timeseries.Any) + loadConnection("container_net_tcp_retransmits", func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.Retransmissions = merge(connection.Retransmissions, metric.Values, timeseries.NanSum) }) loadL7RequestsCount := func(queryName string, protocol model.Protocol) { - loadConnection(queryName, func(connection *model.Connection, metric *model.MetricValues) { + loadConnection(queryName, func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { switch protocol { case model.ProtocolRabbitmq, model.ProtocolNats: protocol += model.Protocol("-" + metric.Labels["method"]) @@ -260,23 +372,23 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model loadL7RequestsCount("container_zookeeper_requests_count", model.ProtocolZookeeper) loadL7RequestsLatency := func(queryName string, protocol model.Protocol) { - loadConnection(queryName, func(connection *model.Connection, metric *model.MetricValues) { - connection.RequestsLatency[protocol] = merge(connection.RequestsLatency[protocol], metric.Values, timeseries.Any) + loadConnection(queryName, func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { + connection.RequestsLatency[protocol] = merge(connection.RequestsLatency[protocol], metric.Values, timeseries.NanSum) }) } - 
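Each `bytes_sent`/`bytes_received` sample is now classified once per direction: same region but different AZ is counted as cross-AZ traffic on both the sending and receiving application; otherwise a public destination counts as internet egress. A condensed sketch of the decision; the naive public-address check stands in for `utils.IsIpExternal`, whose exact rules are not shown in this diff:

```go
package main

import (
	"fmt"
	"net"
)

// classify condenses the per-sample decision in the bytes_sent handler.
func classify(srcRegion, srcAZ, dstRegion, dstAZ, dstIP string) string {
	if dstRegion != "" && dstRegion == srcRegion && srcAZ != "" && dstAZ != "" {
		if srcAZ == dstAZ {
			return "same-az" // not tracked
		}
		return "cross-az" // egress on the sender, ingress on the receiver
	}
	ip := net.ParseIP(dstIP)
	if ip == nil {
		return "internet-egress" // an FQDN destination
	}
	if !ip.IsPrivate() && !ip.IsLoopback() {
		return "internet-egress"
	}
	return "internal" // not tracked
}

func main() {
	fmt.Println(classify("us-east-1", "us-east-1a", "us-east-1", "us-east-1b", "10.0.0.5")) // cross-az
	fmt.Println(classify("us-east-1", "us-east-1a", "", "", "142.250.80.46"))               // internet-egress
}
```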
loadL7RequestsLatency("container_http_requests_latency", model.ProtocolHttp) - loadL7RequestsLatency("container_postgres_queries_latency", model.ProtocolPostgres) - loadL7RequestsLatency("container_mysql_queries_latency", model.ProtocolMysql) - loadL7RequestsLatency("container_mongo_queries_latency", model.ProtocolMongodb) - loadL7RequestsLatency("container_redis_queries_latency", model.ProtocolRedis) - loadL7RequestsLatency("container_memcached_queries_latency", model.ProtocolMemcached) - loadL7RequestsLatency("container_kafka_requests_latency", model.ProtocolKafka) - loadL7RequestsLatency("container_cassandra_queries_latency", model.ProtocolCassandra) - loadL7RequestsLatency("container_clickhouse_queries_latency", model.ProtocolClickhouse) - loadL7RequestsLatency("container_zookeeper_requests_latency", model.ProtocolZookeeper) + loadL7RequestsLatency("container_http_requests_latency_total", model.ProtocolHttp) + loadL7RequestsLatency("container_postgres_queries_latency_total", model.ProtocolPostgres) + loadL7RequestsLatency("container_mysql_queries_latency_total", model.ProtocolMysql) + loadL7RequestsLatency("container_mongo_queries_latency_total", model.ProtocolMongodb) + loadL7RequestsLatency("container_redis_queries_latency_total", model.ProtocolRedis) + loadL7RequestsLatency("container_memcached_queries_latency_total", model.ProtocolMemcached) + loadL7RequestsLatency("container_kafka_requests_latency_total", model.ProtocolKafka) + loadL7RequestsLatency("container_cassandra_queries_latency_total", model.ProtocolCassandra) + loadL7RequestsLatency("container_clickhouse_queries_latency_total", model.ProtocolClickhouse) + loadL7RequestsLatency("container_zookeeper_requests_latency_total", model.ProtocolZookeeper) loadL7RequestsHistogram := func(queryName string, protocol model.Protocol) { - loadConnection(queryName, func(connection *model.Connection, metric *model.MetricValues) { + loadConnection(queryName, func(instance *model.Instance, connection *model.Connection, metric *model.MetricValues) { le, err := strconv.ParseFloat(metric.Labels["le"], 32) if err != nil { klog.Warningln(err) @@ -299,6 +411,26 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model loadL7RequestsHistogram("container_clickhouse_queries_histogram", model.ProtocolClickhouse) loadL7RequestsHistogram("container_zookeeper_requests_histogram", model.ProtocolZookeeper) + loadInstanceByDest := func(queryName string, f func(instance *model.Instance, m *model.MetricValues)) { + ms := metrics[queryName] + for _, m := range ms { + if instance := getInstanceByDest(m); instance != nil { + f(instance, m) + } + } + } + loadInstanceByDest("l7_requests_by_dest", func(instance *model.Instance, m *model.MetricValues) { + status := m.Labels["status"] + if model.IsRequestStatusFailed(status) { + instance.Requests.Failed = merge(instance.Requests.Failed, m.Values, timeseries.NanSum) + } else { + instance.Requests.Ok = merge(instance.Requests.Ok, m.Values, timeseries.NanSum) + } + }) + loadInstanceByDest("l7_total_latency_by_dest", func(instance *model.Instance, m *model.MetricValues) { + instance.Requests.TotalLatency = merge(instance.Requests.TotalLatency, m.Values, timeseries.NanSum) + }) + loadContainer("container_dns_requests_total", func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { r := model.DNSRequest{ Type: metric.Labels["request_type"], @@ -308,12 +440,12 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model return } status := 
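The latency queries switch from a precomputed average (`_sum / _count`) to the rate of the raw `_sum` (the `..._latency_total` names). That is what makes the `sum by(app_id, ...)` rollups sound: totals can be summed across containers and divided once at the end, whereas averaging averages would skew the result. A worked illustration:

```go
package main

import "fmt"

// totalLatencySum is the rate of <metric>_duration_seconds_total_sum,
// i.e. request-seconds per second; okRate/failedRate are request rates.
// Totals stay correct under any amount of summing; one division at the
// end recovers the mean.
func meanLatency(totalLatencySum, okRate, failedRate float64) float64 {
	requests := okRate + failedRate
	if requests == 0 {
		return 0
	}
	return totalLatencySum / requests
}

func main() {
	// 12.5 request-seconds/s over 500 rps -> 25ms mean latency
	fmt.Println(meanLatency(12.5, 480, 20)) // 0.025
}
```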
metric.Labels["status"] - byStatus := container.DNSRequests[r] + byStatus := instance.Owner.DNSRequests[r] if byStatus == nil { byStatus = map[string]*timeseries.TimeSeries{} - container.DNSRequests[r] = byStatus + instance.Owner.DNSRequests[r] = byStatus } - byStatus[status] = merge(byStatus[status], metric.Values, timeseries.Any) + byStatus[status] = merge(byStatus[status], metric.Values, timeseries.NanSum) }) loadContainer("container_dns_requests_latency", func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { le, err := strconv.ParseFloat(metric.Labels["le"], 32) @@ -321,7 +453,7 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model klog.Warningln(err) return } - container.DNSRequestsHistogram[float32(le)] = merge(container.DNSRequestsHistogram[float32(le)], metric.Values, timeseries.Any) + instance.Owner.DNSRequestsHistogram[float32(le)] = merge(instance.Owner.DNSRequestsHistogram[float32(le)], metric.Values, timeseries.Any) }) loadVolume := func(queryName string, f func(volume *model.Volume, metric *model.MetricValues)) { @@ -337,21 +469,17 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model volume.UsedBytes = merge(volume.UsedBytes, metric.Values, timeseries.Any) }) - instancesByListen := map[model.Listen]*model.Instance{} - for _, app := range w.Applications { - for _, instance := range app.Instances { - for l := range instance.TcpListens { - if ip := net.ParseIP(l.IP); ip.IsLoopback() { - if instance.Node != nil { - l.IP = instance.NodeName() - instancesByListen[l] = instance - } - } else { - instancesByListen[l] = instance - } - } - } + loadGPU := func(queryName string, f func(g *model.InstanceGPUUsage, metric *model.MetricValues)) { + loadContainer(queryName, func(instance *model.Instance, container *model.Container, metric *model.MetricValues) { + f(getOrCreateInstanceGPU(instance, metric), metric) + }) } + loadGPU("container_gpu_usage_percent", func(g *model.InstanceGPUUsage, metric *model.MetricValues) { + g.UsageAverage = merge(g.UsageAverage, metric.Values, timeseries.Any) + }) + loadGPU("container_gpu_memory_usage_percent", func(g *model.InstanceGPUUsage, metric *model.MetricValues) { + g.MemoryUsageAverage = merge(g.MemoryUsageAverage, metric.Values, timeseries.Any) + }) for _, app := range w.Applications { // lookup remote instance by listen for _, instance := range app.Instances { @@ -388,6 +516,9 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model } } + isEmpty := func(ts *timeseries.TimeSeries) bool { + return ts.IsEmpty() || ts.Reduce(timeseries.NanSum) == 0. 
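DNS counters move from the container to the owning application (`instance.Owner.DNSRequests`), merged with `NanSum` so every instance adds into the same series. A sketch of the two-level keying, with plain counters standing in for timeseries:

```go
package main

import "fmt"

type dnsRequest struct{ Type, Domain string }

func main() {
	// app-level bookkeeping: (type, domain) -> status -> series;
	// plain counters stand in for timeseries merged with NanSum
	appDNS := map[dnsRequest]map[string]float64{}

	observe := func(typ, domain, status string, rate float64) {
		r := dnsRequest{Type: typ, Domain: domain}
		byStatus := appDNS[r]
		if byStatus == nil {
			byStatus = map[string]float64{}
			appDNS[r] = byStatus
		}
		byStatus[status] += rate
	}
	observe("TypeA", "db.example.com", "ok", 2) // from instance-0
	observe("TypeA", "db.example.com", "ok", 3) // from instance-1
	fmt.Println(appDNS[dnsRequest{"TypeA", "db.example.com"}]["ok"]) // 5
}
```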
+ } for _, app := range w.Applications { // creating ApplicationKindExternalService for unknown remote instances for _, instance := range app.Instances { for _, u := range instance.Upstreams { @@ -406,6 +537,9 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model appId.Name = svc.Name } } else { + if isEmpty(u.SuccessfulConnections) && isEmpty(u.Active) && isEmpty(u.FailedConnections) { + continue + } if u.ActualRemoteIP == "" && net.ParseIP(u.ServiceRemoteIP) == nil { appId.Name = u.ServiceRemoteIP } else if fqdns := ip2fqdn[u.ServiceRemoteIP]; fqdns != nil && fqdns.Len() > 0 { @@ -426,7 +560,7 @@ func (c *Constructor) loadContainers(w *model.World, metrics map[string][]*model } } -func getOrCreateConnection(instance *model.Instance, container string, m *model.MetricValues) *model.Connection { +func getOrCreateConnection(instance *model.Instance, m *model.MetricValues) *model.Connection { if instance.Owner.Id.Name == "docker" { // ignore docker-proxy's connections return nil } @@ -453,7 +587,6 @@ func getOrCreateConnection(instance *model.Instance, container string, m *model. ActualRemotePort: actualPort, ServiceRemoteIP: serviceIP, ServiceRemotePort: servicePort, - Container: container, RequestsCount: map[model.Protocol]map[string]*timeseries.TimeSeries{}, RequestsLatency: map[model.Protocol]*timeseries.TimeSeries{}, @@ -481,6 +614,21 @@ func getOrCreateInstanceVolume(instance *model.Instance, m *model.MetricValues) return volume } +func getOrCreateInstanceGPU(instance *model.Instance, m *model.MetricValues) *model.InstanceGPUUsage { + uuid := m.Labels["gpu_uuid"] + g := instance.GPUUsage[uuid] + if g == nil { + g = &model.InstanceGPUUsage{} + instance.GPUUsage[uuid] = g + } + if instance.Node != nil && instance.Node.GPUs != nil { + if gpu := instance.Node.GPUs[uuid]; gpu != nil { + gpu.Instances[instance.Name] = instance + } + } + return g +} + func externalServiceName(port string) string { service := "" switch port { diff --git a/constructor/elasticache.go b/constructor/elasticache.go index fab1d76c5..ed1fc83c0 100644 --- a/constructor/elasticache.go +++ b/constructor/elasticache.go @@ -66,7 +66,7 @@ func (c *Constructor) loadElasticache(w *model.World, metrics map[string][]*mode } if c.pricing != nil { for _, instance := range ecInstancesById { - instance.Node.Price = c.pricing.GetNodePrice(instance.Node) + instance.Node.Price = c.pricing.GetNodePrice(nil, instance.Node) } } } diff --git a/constructor/fqdn.go b/constructor/fqdn.go index 4266c02f6..afd6ad48f 100644 --- a/constructor/fqdn.go +++ b/constructor/fqdn.go @@ -5,14 +5,22 @@ import ( "github.com/coroot/coroot/utils" ) -func loadFQDNs(metrics map[string][]*model.MetricValues, ip2fqdn map[string]*utils.StringSet) { +func loadFQDNs(metrics map[string][]*model.MetricValues, ip2fqdn, fqdn2ip map[string]*utils.StringSet) { + var ip, fqdn string for _, m := range metrics["ip_to_fqdn"] { - ip := m.Labels["ip"] + ip = m.Labels["ip"] + fqdn = m.Labels["fqdn"] v := ip2fqdn[ip] if v == nil { v = utils.NewStringSet() ip2fqdn[ip] = v } - v.Add(m.Labels["fqdn"]) + v.Add(fqdn) + v = fqdn2ip[fqdn] + if v == nil { + v = utils.NewStringSet() + fqdn2ip[fqdn] = v + } + v.Add(ip) } } diff --git a/constructor/k8s.go b/constructor/k8s.go index 9cc3b9246..edcbb3d5b 100644 --- a/constructor/k8s.go +++ b/constructor/k8s.go @@ -23,6 +23,7 @@ type serviceId struct { func loadKubernetesMetadata(w *model.World, metrics map[string][]*model.MetricValues, servicesByClusterIP map[string]*model.Service) { pods := podInfo(w, 
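`loadFQDNs` now fills both directions from the same `ip_to_fqdn` series: `ip2fqdn` is used to name external services, `fqdn2ip` to materialize their instances (see the connections.go change above). A minimal sketch, with string slices standing in for `utils.StringSet` (which also deduplicates):

```go
package main

import "fmt"

func main() {
	samples := []struct{ IP, FQDN string }{
		{"10.0.1.5", "db.example.com"},
		{"10.0.2.7", "db.example.com"},
	}
	ip2fqdn := map[string][]string{} // names external services
	fqdn2ip := map[string][]string{} // materializes their instances
	for _, s := range samples {
		ip2fqdn[s.IP] = append(ip2fqdn[s.IP], s.FQDN)
		fqdn2ip[s.FQDN] = append(fqdn2ip[s.FQDN], s.IP)
	}
	fmt.Println(fqdn2ip["db.example.com"]) // [10.0.1.5 10.0.2.7]
}
```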
metrics["kube_pod_info"]) podLabels(metrics["kube_pod_labels"], pods) + podAnnotations(metrics["kube_pod_annotations"], pods) appsByPodIP := map[string]*model.Application{} for _, pod := range pods { @@ -111,6 +112,9 @@ func loadApplications(w *model.World, metrics map[string][]*model.MetricValues) case strings.HasPrefix(queryName, "kube_daemonset_"): kind = model.ApplicationKindDaemonSet nameLabel = "daemonset" + case strings.HasPrefix(queryName, "kube_cronjob_"): + kind = model.ApplicationKindCronJob + nameLabel = "cronjob" default: continue } @@ -122,6 +126,8 @@ func loadApplications(w *model.World, metrics map[string][]*model.MetricValues) switch queryName { case "kube_deployment_spec_replicas", "kube_statefulset_replicas", "kube_daemonset_status_desired_number_scheduled": app.DesiredInstances = merge(app.DesiredInstances, m.Values, timeseries.Any) + case "kube_deployment_annotations", "kube_statefulset_annotations", "kube_daemonset_annotations", "kube_cronjob_annotations": + app.Annotations.UpdateFromLabels(m.Labels, m.Values) } } } @@ -212,7 +218,6 @@ func podLabels(metrics []*model.MetricValues, pods map[string]*model.Instance) { } instance := pods[uid] if instance == nil { - //klog.Warningln("unknown pod:", uid, m.Labels["pod"], m.Labels["namespace"]) continue } cluster, role := "", "" @@ -236,6 +241,8 @@ func podLabels(metrics []*model.MetricValues, pods map[string]*model.Instance) { role = m.Labels["label_role"] case m.Labels["label_app_kubernetes_io_managed_by"] == "percona-server-mongodb-operator": cluster = m.Labels["label_app_kubernetes_io_instance"] + case m.Labels["label_app_kubernetes_io_managed_by"] == "percona-xtradb-cluster-operator": + cluster = m.Labels["label_app_kubernetes_io_instance"] case strings.HasPrefix(m.Labels["label_helm_sh_chart"], "mongodb"): if m.Labels["label_app_kubernetes_io_name"] != "" && m.Labels["label_app_kubernetes_io_instance"] != "" { cluster = m.Labels["label_app_kubernetes_io_instance"] + "-" + m.Labels["label_app_kubernetes_io_name"] @@ -271,6 +278,20 @@ func podLabels(metrics []*model.MetricValues, pods map[string]*model.Instance) { } } +func podAnnotations(metrics []*model.MetricValues, pods map[string]*model.Instance) { + for _, m := range metrics { + uid := m.Labels["uid"] + if uid == "" { + continue + } + instance := pods[uid] + if instance == nil { + continue + } + instance.Annotations.UpdateFromLabels(m.Labels, m.Values) + } +} + func podStatus(queryName string, metrics []*model.MetricValues, pods map[string]*model.Instance) { for _, m := range metrics { uid := m.Labels["uid"] diff --git a/constructor/logs.go b/constructor/logs.go index 8da00c062..4dc9185e6 100644 --- a/constructor/logs.go +++ b/constructor/logs.go @@ -11,11 +11,11 @@ import ( ) func logMessage(instance *model.Instance, metric *model.MetricValues, pjs promJobStatuses) { - level := model.LogLevel(metric.Labels["level"]) - msgs := instance.Owner.LogMessages[level] + severity := model.SeverityFromString(metric.Labels["level"]) + msgs := instance.Owner.LogMessages[severity] if msgs == nil { msgs = &model.LogMessages{} - instance.Owner.LogMessages[level] = msgs + instance.Owner.LogMessages[severity] = msgs } values := timeseries.Increase(metric.Values, pjs.get(metric.Labels)) msgs.Messages = merge(msgs.Messages, values, timeseries.NanSum) @@ -79,13 +79,13 @@ func (c *Constructor) loadApplicationLogs(w *model.World, metrics map[string][]* continue } if app.LogMessages == nil { - app.LogMessages = map[model.LogLevel]*model.LogMessages{} + app.LogMessages = 
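Annotations arrive as labels on the `kube_*_annotations` metrics and are folded in via `Annotations.UpdateFromLabels`. A hypothetical sketch of that label-to-annotation mapping; the real table is `model.ApplicationAnnotationLabels`, which is not part of this diff, so both the label names and annotation keys below are assumptions:

```go
package main

import "fmt"

// Hypothetical entries in the spirit of model.ApplicationAnnotationLabels
// (prometheus label name -> annotation key); the real table is not shown.
var annotationLabels = map[string]string{
	"annotation_coroot_com_application_category":    "coroot.com/application-category",
	"annotation_coroot_com_custom_application_name": "coroot.com/custom-application-name",
}

// updateFromLabels sketches Annotations.UpdateFromLabels: copy every
// known annotation label that is present and non-empty.
func updateFromLabels(annotations, labels map[string]string) {
	for label, key := range annotationLabels {
		if v := labels[label]; v != "" {
			annotations[key] = v
		}
	}
}

func main() {
	a := map[string]string{}
	updateFromLabels(a, map[string]string{
		"namespace": "default",
		"annotation_coroot_com_application_category": "databases",
	})
	fmt.Println(a) // map[coroot.com/application-category:databases]
}
```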
map[model.Severity]*model.LogMessages{} } - level := model.LogLevel(metric.Labels["level"]) - msgs := app.LogMessages[level] + severity := model.SeverityFromString(metric.Labels["level"]) + msgs := app.LogMessages[severity] if msgs == nil { msgs = &model.LogMessages{} - app.LogMessages[level] = msgs + app.LogMessages[severity] = msgs } msgs.Messages = merge(msgs.Messages, metric.Values, timeseries.NanSum) similar := metric.Labels["similar"] diff --git a/constructor/nodes.go b/constructor/nodes.go index 689c4a20b..23262af42 100644 --- a/constructor/nodes.go +++ b/constructor/nodes.go @@ -99,6 +99,8 @@ func (c *Constructor) loadNodes(w *model.World, metrics map[string][]*model.Metr nodeDisk(node, queryName, m) } else if strings.HasPrefix(queryName, "node_net_") { nodeInterface(node, queryName, m) + } else if strings.HasPrefix(queryName, "node_gpu") { + nodeGPU(node, queryName, m) } } } @@ -118,7 +120,7 @@ func (c *Constructor) loadNodes(w *model.World, metrics map[string][]*model.Metr } if c.pricing != nil { for _, n := range w.Nodes { - n.Price = c.pricing.GetNodePrice(n) + n.Price = c.pricing.GetNodePrice(c.project.Settings.CustomCloudPricing, n) n.DataTransferPrice = c.pricing.GetDataTransferPrice(n) } } @@ -174,3 +176,35 @@ func nodeInterface(node *model.Node, queryName string, m *model.MetricValues) { stat.TxBytes = merge(stat.TxBytes, m.Values, timeseries.Any) } } + +func nodeGPU(node *model.Node, queryName string, m *model.MetricValues) { + uuid := m.Labels["gpu_uuid"] + gpu := node.GPUs[uuid] + if gpu == nil { + gpu = &model.GPU{ + UUID: uuid, + Instances: map[string]*model.Instance{}, + } + node.GPUs[uuid] = gpu + } + switch queryName { + case "node_gpu_info": + gpu.Name.Update(m.Values, m.Labels["name"]) + case "node_gpu_memory_total_bytes": + gpu.TotalMemory = merge(gpu.TotalMemory, m.Values, timeseries.Any) + case "node_gpu_memory_used_bytes": + gpu.UsedMemory = merge(gpu.UsedMemory, m.Values, timeseries.Any) + case "node_gpu_memory_utilization_percent_avg": + gpu.MemoryUsageAverage = merge(gpu.MemoryUsageAverage, m.Values, timeseries.Any) + case "node_gpu_memory_utilization_percent_peak": + gpu.MemoryUsagePeak = merge(gpu.MemoryUsagePeak, m.Values, timeseries.Any) + case "node_gpu_temperature_celsius": + gpu.Temperature = merge(gpu.Temperature, m.Values, timeseries.Any) + case "node_gpu_power_usage_watts": + gpu.PowerWatts = merge(gpu.PowerWatts, m.Values, timeseries.Any) + case "node_gpu_utilization_percent_avg": + gpu.UsageAverage = merge(gpu.UsageAverage, m.Values, timeseries.Any) + case "node_gpu_utilization_percent_peak": + gpu.UsagePeak = merge(gpu.UsagePeak, m.Values, timeseries.Any) + } +} diff --git a/constructor/queries.go b/constructor/queries.go index 4b44ccd7d..9066e791b 100644 --- a/constructor/queries.go +++ b/constructor/queries.go @@ -3,20 +3,23 @@ package constructor import ( "fmt" "slices" + "strconv" "strings" + "time" "github.com/coroot/coroot/db" "github.com/coroot/coroot/model" "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" promModel "github.com/prometheus/common/model" - "inet.af/netaddr" + "golang.org/x/exp/maps" + "k8s.io/klog" ) const ( - qApplicationCustomSLI = "application_custom_sli" - qRecordingRuleApplicationLogMessages = "rr_application_log_messages" + qApplicationCustomSLI = "application_custom_sli" + qRecordingRuleApplicationLogMessages = "rr_application_log_messages" qRecordingRuleApplicationTCPSuccessful = "rr_connection_tcp_successful" qRecordingRuleApplicationTCPActive = "rr_connection_tcp_active" 
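Log counters are now keyed by `model.Severity` rather than raw level strings, with `SeverityFromString` normalizing whatever spelling the agent reports. An illustrative normalizer in that spirit (not coroot's actual mapping, which is not shown here):

```go
package main

import (
	"fmt"
	"strings"
)

// severityFromString is illustrative only: it folds common level
// spellings into a small fixed set, as SeverityFromString presumably does.
func severityFromString(level string) string {
	switch strings.ToLower(level) {
	case "critical", "crit", "fatal":
		return "critical"
	case "error", "err":
		return "error"
	case "warning", "warn":
		return "warning"
	case "info":
		return "info"
	case "debug", "trace":
		return "debug"
	}
	return "unknown"
}

func main() {
	fmt.Println(severityFromString("WARN"))  // warning
	fmt.Println(severityFromString("fatal")) // critical
}
```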
qRecordingRuleApplicationTCPFailed = "rr_connection_tcp_failed" @@ -27,15 +30,14 @@ const ( qRecordingRuleApplicationNetLatency = "rr_connection_net_latency" qRecordingRuleApplicationL7Requests = "rr_connection_l7_requests" qRecordingRuleApplicationL7Latency = "rr_connection_l7_latency" - - qRecordingRuleApplicationTraffic = "rr_application_traffic" - - qRecordingRuleInstanceL7Requests = "rr_instance_l7_requests" - qRecordingRuleInstanceL7Latency = "rr_instance_l7_latency" - - qRecordingRuleApplicationL7Histogram = "rr_application_l7_histogram" + qRecordingRuleApplicationTraffic = "rr_application_traffic" + qRecordingRuleApplicationL7Histogram = "rr_application_l7_histogram" + qRecordingRuleApplicationCategories = "rr_application_categories" + qRecordingRuleApplicationSLO = "rr_application_slo" ) +var applicationAnnotations = maps.Keys(model.ApplicationAnnotationLabels) + var qConnectionAggregations = []string{ qRecordingRuleApplicationTCPSuccessful, qRecordingRuleApplicationTCPActive, @@ -47,8 +49,6 @@ var qConnectionAggregations = []string{ qRecordingRuleApplicationNetLatency, qRecordingRuleApplicationL7Requests, qRecordingRuleApplicationL7Latency, - qRecordingRuleInstanceL7Requests, - qRecordingRuleInstanceL7Latency, qRecordingRuleApplicationTraffic, } @@ -73,7 +73,7 @@ func Q(name, query string, labels ...string) Query { } func qItoI(name, query string, labels ...string) Query { - q := Q(name, query, labels...) + q := Q(name, query, append(labels, "app_id")...) q.InstanceToInstance = true return q } @@ -102,6 +102,22 @@ func qFargateContainer(name, query string, labels ...string) Query { return Q(name, query, slices.Concat([]string{"kubernetes_io_hostname", "namespace", "pod", "container"}, labels)...) } +func l7Req(metric string) string { + return fmt.Sprintf(`sum by(app_id, destination, actual_destination, status) (rate(%s{app_id!=""}[$RANGE])) or rate(%s{app_id=""}[$RANGE])`, metric, metric) +} + +func l7ReqWithMethod(metric string) string { + return fmt.Sprintf(`sum by(app_id, destination, actual_destination, status, method) (rate(%s{app_id!=""}[$RANGE])) or rate(%s{app_id=""}[$RANGE])`, metric, metric) +} + +func l7Latency(metric string) string { + return fmt.Sprintf(`sum by(app_id, destination, actual_destination) (rate(%s{app_id!=""}[$RANGE])) or rate(%s{app_id=""}[$RANGE])`, metric, metric) +} + +func l7Histogram(metric string) string { + return fmt.Sprintf(`sum by(app_id, destination, actual_destination, le) (rate(%s{app_id!=""}[$RANGE])) or rate(%s{app_id=""}[$RANGE])`, metric, metric) +} + var QUERIES = []Query{ Q("node_agent_info", `node_agent_info`, "version"), @@ -128,6 +144,15 @@ var QUERIES = []Query{ Q("node_net_ip", `node_net_interface_ip`, "interface", "ip"), Q("node_net_rx_bytes", `rate(node_net_received_bytes_total[$RANGE])`, "interface"), Q("node_net_tx_bytes", `rate(node_net_transmitted_bytes_total[$RANGE])`, "interface"), + Q("node_gpu_info", `node_gpu_info`, "gpu_uuid", "name"), + Q("node_gpu_memory_total_bytes", `node_resources_gpu_memory_total_bytes`, "gpu_uuid"), + Q("node_gpu_memory_used_bytes", `node_resources_gpu_memory_used_bytes`, "gpu_uuid"), + Q("node_gpu_memory_utilization_percent_avg", `node_resources_gpu_memory_utilization_percent_avg`, "gpu_uuid"), + Q("node_gpu_memory_utilization_percent_peak", `node_resources_gpu_memory_utilization_percent_peak`, "gpu_uuid"), + Q("node_gpu_utilization_percent_avg", `node_resources_gpu_utilization_percent_avg`, "gpu_uuid"), + Q("node_gpu_utilization_percent_peak", `node_resources_gpu_utilization_percent_peak`, 
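The four `l7*` helpers introduced below generate the same PromQL shape for every protocol: aggregate by `app_id` when the agent reports it, fall back to the raw per-container series otherwise. Expanding `l7Req` for one metric makes the pattern visible:

```go
package main

import "fmt"

// Reproduction of the l7Req helper from this diff; printing its output
// for one metric shows the aggregate-or-fallback shape.
func l7Req(metric string) string {
	return fmt.Sprintf(`sum by(app_id, destination, actual_destination, status) (rate(%s{app_id!=""}[$RANGE])) or rate(%s{app_id=""}[$RANGE])`, metric, metric)
}

func main() {
	fmt.Println(l7Req("container_http_requests_total"))
	// sum by(app_id, destination, actual_destination, status)
	//   (rate(container_http_requests_total{app_id!=""}[$RANGE]))
	// or rate(container_http_requests_total{app_id=""}[$RANGE])
}
```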
"gpu_uuid"), + Q("node_gpu_temperature_celsius", `node_resources_gpu_temperature_celsius`, "gpu_uuid"), + Q("node_gpu_power_usage_watts", `node_resources_gpu_power_usage_watts`, "gpu_uuid"), Q("ip_to_fqdn", `sum by(fqdn, ip) (ip_to_fqdn)`, "ip", "fqdn"), @@ -150,8 +175,13 @@ var QUERIES = []Query{ Q("kube_deployment_spec_replicas", `kube_deployment_spec_replicas`, "namespace", "deployment"), Q("kube_daemonset_status_desired_number_scheduled", `kube_daemonset_status_desired_number_scheduled`, "namespace", "daemonset"), Q("kube_statefulset_replicas", `kube_statefulset_replicas`, "namespace", "statefulset"), + Q("kube_deployment_annotations", `kube_deployment_annotations`, append(applicationAnnotations, "namespace", "deployment")...), + Q("kube_statefulset_annotations", `kube_statefulset_annotations`, append(applicationAnnotations, "namespace", "statefulset")...), + Q("kube_daemonset_annotations", `kube_daemonset_annotations`, append(applicationAnnotations, "namespace", "daemonset")...), + Q("kube_cronjob_annotations", `kube_cronjob_annotations`, append(applicationAnnotations, "namespace", "cronjob")...), qPod("kube_pod_info", `kube_pod_info`, "namespace", "pod", "created_by_name", "created_by_kind", "node", "pod_ip", "host_ip"), + qPod("kube_pod_annotations", hasNotEmptyLabel("kube_pod_annotations", applicationAnnotations), applicationAnnotations...), qPod("kube_pod_labels", `kube_pod_labels`, "label_postgres_operator_crunchydata_com_cluster", "label_postgres_operator_crunchydata_com_role", "label_cluster_name", "label_team", "label_application", "label_spilo_role", @@ -165,7 +195,7 @@ var QUERIES = []Query{ "label_app_kubernetes_io_name", "label_app_kubernetes_io_component", "label_app_kubernetes_io_part_of", ), - qPod("kube_pod_status_phase", `kube_pod_status_phase`, "phase"), + qPod("kube_pod_status_phase", `kube_pod_status_phase > 0`, "phase"), qPod("kube_pod_status_ready", `kube_pod_status_ready{condition="true"}`), qPod("kube_pod_status_scheduled", `kube_pod_status_scheduled{condition="true"} > 0`), qPod("kube_pod_init_container_info", `kube_pod_init_container_info`, "namespace", "pod", "container"), @@ -191,54 +221,59 @@ var QUERIES = []Query{ Q("container_restarts", `container_restarts_total % 10000000`, "job", "instance"), Q("container_volume_size", `container_resources_disk_size_bytes`, "mount_point", "volume", "device"), Q("container_volume_used", `container_resources_disk_used_bytes`, "mount_point", "volume", "device"), + Q("container_gpu_usage_percent", `container_resources_gpu_usage_percent`, "gpu_uuid"), + Q("container_gpu_memory_usage_percent", `container_resources_gpu_memory_usage_percent`, "gpu_uuid"), Q("container_net_tcp_listen_info", `container_net_tcp_listen_info`, "listen_addr", "proxy"), - qItoI("container_net_latency", `container_net_latency_seconds`), - qItoI("container_net_tcp_successful_connects", `rate(container_net_tcp_successful_connects_total[$RANGE])`), - qItoI("container_net_tcp_failed_connects", `rate(container_net_tcp_failed_connects_total[$RANGE])`), - qItoI("container_net_tcp_active_connections", `container_net_tcp_active_connections`), - qItoI("container_net_tcp_connection_time_seconds", `rate(container_net_tcp_connection_time_seconds_total[$RANGE])`), - qItoI("container_net_tcp_bytes_sent", `rate(container_net_tcp_bytes_sent_total[$RANGE])`), - qItoI("container_net_tcp_bytes_received", `rate(container_net_tcp_bytes_received_total[$RANGE])`), - qItoI("container_net_tcp_retransmits", `rate(container_net_tcp_retransmits_total[$RANGE])`), + 
qItoI("container_net_latency", `avg by(app_id, destination_ip) (container_net_latency_seconds{app_id!=""}) or container_net_latency_seconds{app_id=""}`), + qItoI("container_net_tcp_successful_connects", `sum by(app_id, destination, actual_destination) (rate(container_net_tcp_successful_connects_total{app_id!=""}[$RANGE])) or rate(container_net_tcp_successful_connects_total{app_id=""}[$RANGE])`), + qItoI("container_net_tcp_failed_connects", `sum by(app_id, destination, actual_destination) (rate(container_net_tcp_failed_connects_total{app_id!=""}[$RANGE])) or rate(container_net_tcp_failed_connects_total{app_id=""}[$RANGE])`), + qItoI("container_net_tcp_active_connections", `sum by(app_id, destination, actual_destination) (container_net_tcp_active_connections{app_id!=""}) or container_net_tcp_active_connections{app_id=""}`), + qItoI("container_net_tcp_connection_time_seconds", `sum by(app_id, destination, actual_destination) (rate(container_net_tcp_connection_time_seconds_total{app_id!=""}[$RANGE])) or rate(container_net_tcp_connection_time_seconds_total{app_id=""}[$RANGE])`), + qItoI("container_net_tcp_bytes_sent", `sum by(app_id, destination, actual_destination, az, region) (rate(container_net_tcp_bytes_sent_total{app_id!=""}[$RANGE])) or rate(container_net_tcp_bytes_sent_total{app_id=""}[$RANGE])`, "region", "az"), + qItoI("container_net_tcp_bytes_received", `sum by(app_id, destination, actual_destination, az, region) (rate(container_net_tcp_bytes_received_total{app_id!=""}[$RANGE])) or rate(container_net_tcp_bytes_received_total{app_id=""}[$RANGE])`, "region", "az"), + qItoI("container_net_tcp_retransmits", `sum by(app_id, destination, actual_destination) (rate(container_net_tcp_retransmits_total{app_id!=""}[$RANGE])) or rate(container_net_tcp_retransmits_total{app_id=""}[$RANGE])`), Q("container_log_messages", `container_log_messages_total % 10000000`, "level", "pattern_hash", "sample", "job", "instance"), - qItoI("container_http_requests_count", `rate(container_http_requests_total[$RANGE])`, "status"), - qItoI("container_http_requests_latency", `rate(container_http_requests_duration_seconds_total_sum [$RANGE]) / rate(container_http_requests_duration_seconds_total_count [$RANGE])`), - qItoI("container_http_requests_histogram", `rate(container_http_requests_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_postgres_queries_count", `rate(container_postgres_queries_total[$RANGE])`, "status"), - qItoI("container_postgres_queries_latency", `rate(container_postgres_queries_duration_seconds_total_sum [$RANGE]) / rate(container_postgres_queries_duration_seconds_total_count [$RANGE])`), - qItoI("container_postgres_queries_histogram", `rate(container_postgres_queries_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_redis_queries_count", `rate(container_redis_queries_total[$RANGE])`, "status"), - qItoI("container_redis_queries_latency", `rate(container_redis_queries_duration_seconds_total_sum [$RANGE]) / rate(container_redis_queries_duration_seconds_total_count [$RANGE])`), - qItoI("container_redis_queries_histogram", `rate(container_redis_queries_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_memcached_queries_count", `rate(container_memcached_queries_total[$RANGE])`, "status"), - qItoI("container_memcached_queries_latency", `rate(container_memcached_queries_duration_seconds_total_sum [$RANGE]) / rate(container_memcached_queries_duration_seconds_total_count [$RANGE])`), - qItoI("container_memcached_queries_histogram", 
`rate(container_memcached_queries_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_mysql_queries_count", `rate(container_mysql_queries_total[$RANGE])`, "status"), - qItoI("container_mysql_queries_latency", `rate(container_mysql_queries_duration_seconds_total_sum [$RANGE]) / rate(container_mysql_queries_duration_seconds_total_count [$RANGE])`), - qItoI("container_mysql_queries_histogram", `rate(container_mysql_queries_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_mongo_queries_count", `rate(container_mongo_queries_total[$RANGE])`, "status"), - qItoI("container_mongo_queries_latency", `rate(container_mongo_queries_duration_seconds_total_sum [$RANGE]) / rate(container_mongo_queries_duration_seconds_total_count [$RANGE])`), - qItoI("container_mongo_queries_histogram", `rate(container_mongo_queries_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_kafka_requests_count", `rate(container_kafka_requests_total[$RANGE])`, "status"), - qItoI("container_kafka_requests_latency", `rate(container_kafka_requests_duration_seconds_total_sum [$RANGE]) / rate(container_kafka_requests_duration_seconds_total_count [$RANGE])`), - qItoI("container_kafka_requests_histogram", `rate(container_kafka_requests_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_cassandra_queries_count", `rate(container_cassandra_queries_total[$RANGE])`, "status"), - qItoI("container_cassandra_queries_latency", `rate(container_cassandra_queries_duration_seconds_total_sum [$RANGE]) / rate(container_cassandra_queries_duration_seconds_total_count [$RANGE])`), - qItoI("container_cassandra_queries_histogram", `rate(container_cassandra_queries_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_clickhouse_queries_count", `rate(container_clickhouse_queries_total[$RANGE])`, "status"), - qItoI("container_clickhouse_queries_latency", `rate(container_clickhouse_queries_duration_seconds_total_sum [$RANGE]) / rate(container_clickhouse_queries_duration_seconds_total_count [$RANGE])`), - qItoI("container_clickhouse_queries_histogram", `rate(container_clickhouse_queries_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_zookeeper_requests_count", `rate(container_zookeeper_requests_total[$RANGE])`, "status"), - qItoI("container_zookeeper_requests_latency", `rate(container_zookeeper_requests_duration_seconds_total_sum [$RANGE]) / rate(container_zookeeper_requests_duration_seconds_total_count [$RANGE])`), - qItoI("container_zookeeper_requests_histogram", `rate(container_zookeeper_requests_duration_seconds_total_bucket[$RANGE])`, "le"), - qItoI("container_rabbitmq_messages", `rate(container_rabbitmq_messages_total[$RANGE])`, "status", "method"), - qItoI("container_nats_messages", `rate(container_nats_messages_total[$RANGE])`, "status", "method"), - - Q("container_dns_requests_total", `rate(container_dns_requests_total[$RANGE])`, "request_type", "domain", "status"), - Q("container_dns_requests_latency", `rate(container_dns_requests_duration_seconds_total_bucket[$RANGE])`, "le"), + qItoI("container_http_requests_count", l7Req("container_http_requests_total"), "status"), + qItoI("container_http_requests_latency_total", l7Latency("container_http_requests_duration_seconds_total_sum")), + qItoI("container_http_requests_histogram", l7Histogram("container_http_requests_duration_seconds_total_bucket"), "le"), + qItoI("container_postgres_queries_count", l7Req("container_postgres_queries_total"), "status"), + qItoI("container_postgres_queries_latency_total", 
l7Latency("container_postgres_queries_duration_seconds_total_sum")), + qItoI("container_postgres_queries_histogram", l7Histogram("container_postgres_queries_duration_seconds_total_bucket"), "le"), + qItoI("container_redis_queries_count", l7Req("container_redis_queries_total"), "status"), + qItoI("container_redis_queries_latency_total", l7Latency("container_redis_queries_duration_seconds_total_sum")), + qItoI("container_redis_queries_histogram", l7Histogram("container_redis_queries_duration_seconds_total_bucket"), "le"), + qItoI("container_memcached_queries_count", l7Req("container_memcached_queries_total"), "status"), + qItoI("container_memcached_queries_latency_total", l7Latency("container_memcached_queries_duration_seconds_total_sum")), + qItoI("container_memcached_queries_histogram", l7Histogram("container_memcached_queries_duration_seconds_total_bucket"), "le"), + qItoI("container_mysql_queries_count", l7Req("container_mysql_queries_total"), "status"), + qItoI("container_mysql_queries_latency_total", l7Latency("container_mysql_queries_duration_seconds_total_sum")), + qItoI("container_mysql_queries_histogram", l7Histogram("container_mysql_queries_duration_seconds_total_bucket"), "le"), + qItoI("container_mongo_queries_count", l7Req("container_mongo_queries_total"), "status"), + qItoI("container_mongo_queries_latency_total", l7Latency("container_mongo_queries_duration_seconds_total_sum")), + qItoI("container_mongo_queries_histogram", l7Histogram("container_mongo_queries_duration_seconds_total_bucket"), "le"), + qItoI("container_kafka_requests_count", l7Req("container_kafka_requests_total"), "status"), + qItoI("container_kafka_requests_latency_total", l7Latency("container_kafka_requests_duration_seconds_total_sum")), + qItoI("container_kafka_requests_histogram", l7Histogram("container_kafka_requests_duration_seconds_total_bucket"), "le"), + qItoI("container_cassandra_queries_count", l7Req("container_cassandra_queries_total"), "status"), + qItoI("container_cassandra_queries_latency_total", l7Latency("container_cassandra_queries_duration_seconds_total_sum")), + qItoI("container_cassandra_queries_histogram", l7Histogram("container_cassandra_queries_duration_seconds_total_bucket"), "le"), + qItoI("container_clickhouse_queries_count", l7Req("container_clickhouse_queries_total"), "status"), + qItoI("container_clickhouse_queries_latency_total", l7Latency("container_clickhouse_queries_duration_seconds_total_sum")), + qItoI("container_clickhouse_queries_histogram", l7Histogram("container_clickhouse_queries_duration_seconds_total_bucket"), "le"), + qItoI("container_zookeeper_requests_count", l7Req("container_zookeeper_requests_total"), "status"), + qItoI("container_zookeeper_requests_latency_total", l7Latency("container_zookeeper_requests_duration_seconds_total_sum")), + qItoI("container_zookeeper_requests_histogram", l7Histogram("container_zookeeper_requests_duration_seconds_total_bucket"), "le"), + qItoI("container_rabbitmq_messages", l7ReqWithMethod("container_rabbitmq_messages_total"), "status", "method"), + qItoI("container_nats_messages", l7ReqWithMethod("container_nats_messages_total"), "status", "method"), + + Q("l7_requests_by_dest", "sum by(actual_destination, status) (rate(container_mongo_queries_total[$RANGE]) or rate(container_mysql_queries_total[$RANGE]))", "status"), + Q("l7_total_latency_by_dest", "sum by(actual_destination) (rate(container_mongo_queries_duration_seconds_total_sum[$RANGE]) or rate(container_mysql_queries_duration_seconds_total_sum[$RANGE]))"), + + 
Q("container_dns_requests_total", `sum by(app_id, request_type, domain, status) (rate(container_dns_requests_total{app_id!=""}[$RANGE])) or rate(container_dns_requests_total{app_id=""}[$RANGE])`, "app_id", "request_type", "domain", "status"), + Q("container_dns_requests_latency", `sum by(app_id, le) (rate(container_dns_requests_duration_seconds_total_bucket{app_id!=""}[$RANGE])) or rate(container_dns_requests_duration_seconds_total_bucket{app_id=""}[$RANGE]) `, "app_id", "le"), Q("aws_discovery_error", `aws_discovery_error`, "error"), qRDS("aws_rds_info", `aws_rds_info`, "cluster_id", "ipv4", "port", "engine", "engine_version", "instance_type", "storage_type", "region", "availability_zone", "multi_az"), @@ -335,21 +370,21 @@ var QUERIES = []Query{ Q("container_python_thread_lock_wait_time_seconds", `rate(container_python_thread_lock_wait_time_seconds[$RANGE])`), } -var RecordingRules = map[string]func(p *db.Project, w *model.World) []*model.MetricValues{ - qRecordingRuleApplicationLogMessages: func(p *db.Project, w *model.World) []*model.MetricValues { +var RecordingRules = map[string]func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues{ + qRecordingRuleApplicationLogMessages: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { var res []*model.MetricValues for _, app := range w.Applications { appId := app.Id.String() - for level, msgs := range app.LogMessages { + for severity, msgs := range app.LogMessages { if len(msgs.Patterns) == 0 { if msgs.Messages.Reduce(timeseries.NanSum) > 0 { - ls := model.Labels{"application": appId, "level": string(level)} + ls := model.Labels{"application": appId, "level": severity.String()} res = append(res, &model.MetricValues{Labels: ls, LabelsHash: promModel.LabelsToSignature(ls), Values: msgs.Messages}) } } else { for _, pattern := range msgs.Patterns { if pattern.Messages.Reduce(timeseries.NanSum) > 0 { - ls := model.Labels{"application": appId, "level": string(level)} + ls := model.Labels{"application": appId, "level": severity.String()} ls["multiline"] = fmt.Sprintf("%t", pattern.Multiline) ls["similar"] = strings.Join(pattern.SimilarPatternHashes.Items(), " ") ls["sample"] = pattern.Sample @@ -362,29 +397,29 @@ var RecordingRules = map[string]func(p *db.Project, w *model.World) []*model.Met } return res }, - qRecordingRuleApplicationTCPSuccessful: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTCPSuccessful: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { return aggConnections(w, func(c *model.Connection) *timeseries.TimeSeries { return c.SuccessfulConnections }) }, - qRecordingRuleApplicationTCPActive: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTCPActive: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { return aggConnections(w, func(c *model.Connection) *timeseries.TimeSeries { return c.Active }) }, - qRecordingRuleApplicationTCPFailed: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTCPFailed: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { return aggConnections(w, func(c *model.Connection) *timeseries.TimeSeries { return c.FailedConnections }) }, - qRecordingRuleApplicationTCPConnectionTime: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTCPConnectionTime: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { return aggConnections(w, func(c *model.Connection) 
*timeseries.TimeSeries { return c.ConnectionTime }) }, - qRecordingRuleApplicationTCPBytesSent: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTCPBytesSent: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { return aggConnections(w, func(c *model.Connection) *timeseries.TimeSeries { return c.BytesSent }) }, - qRecordingRuleApplicationTCPBytesReceived: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTCPBytesReceived: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { return aggConnections(w, func(c *model.Connection) *timeseries.TimeSeries { return c.BytesReceived }) }, - qRecordingRuleApplicationTCPRetransmissions: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTCPRetransmissions: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { return aggConnections(w, func(c *model.Connection) *timeseries.TimeSeries { return c.Retransmissions }) }, - qRecordingRuleApplicationNetLatency: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationNetLatency: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { var res []*model.MetricValues for _, app := range w.Applications { @@ -414,7 +449,7 @@ var RecordingRules = map[string]func(p *db.Project, w *model.World) []*model.Met return res }, - qRecordingRuleApplicationL7Requests: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationL7Requests: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { var res []*model.MetricValues type key struct { status string @@ -456,92 +491,7 @@ var RecordingRules = map[string]func(p *db.Project, w *model.World) []*model.Met return res }, - qRecordingRuleInstanceL7Requests: func(p *db.Project, w *model.World) []*model.MetricValues { - var res []*model.MetricValues - type requests struct { - ok *timeseries.Aggregate - failed *timeseries.Aggregate - } - - instances := map[*model.Instance]*requests{} - for _, app := range w.Applications { - for _, instance := range app.Instances { - for _, u := range instance.Upstreams { - if u.RemoteInstance == nil { - continue - } - reqs := instances[u.RemoteInstance] - if reqs == nil { - reqs = &requests{ - ok: timeseries.NewAggregate(timeseries.NanSum), - failed: timeseries.NewAggregate(timeseries.NanSum), - } - instances[u.RemoteInstance] = reqs - } - for _, byStatus := range u.RequestsCount { - for status, ts := range byStatus { - if model.IsRequestStatusFailed(status) { - reqs.failed.Add(ts) - } else { - reqs.ok.Add(ts) - } - } - } - } - } - } - for instance, reqs := range instances { - id := instance.NodeContainerID() - if id == nil { - continue - } - if ts := reqs.ok.Get(); !ts.IsEmpty() { - ls := model.Labels{model.LabelContainerId: id.ContainerId, model.LabelMachineId: id.MachineID, model.LabelSystemUuid: id.SystemUUID} - res = append(res, &model.MetricValues{Labels: ls, LabelsHash: promModel.LabelsToSignature(ls), Values: ts}) - } - if ts := reqs.failed.Get(); !ts.IsEmpty() { - ls := model.Labels{model.LabelContainerId: id.ContainerId, model.LabelMachineId: id.MachineID, model.LabelSystemUuid: id.SystemUUID, "failed": "1"} - res = append(res, &model.MetricValues{Labels: ls, LabelsHash: promModel.LabelsToSignature(ls), Values: ts}) - } - } - return res - }, - - qRecordingRuleInstanceL7Latency: func(p *db.Project, w *model.World) []*model.MetricValues { - var res []*model.MetricValues - - instances := 
map[*model.Instance]*timeseries.Aggregate{} - for _, app := range w.Applications { - for _, instance := range app.Instances { - for _, u := range instance.Upstreams { - if u.RemoteInstance == nil { - continue - } - agg := instances[u.RemoteInstance] - if agg == nil { - agg = timeseries.NewAggregate(timeseries.NanSum) - instances[u.RemoteInstance] = agg - } - for _, ts := range u.RequestsLatency { - agg.Add(ts) - } - } - } - } - for instance, agg := range instances { - id := instance.NodeContainerID() - if id == nil { - continue - } - if ts := agg.Get(); !ts.IsEmpty() { - ls := model.Labels{model.LabelContainerId: id.ContainerId, model.LabelMachineId: id.MachineID, model.LabelSystemUuid: id.SystemUUID} - res = append(res, &model.MetricValues{Labels: ls, LabelsHash: promModel.LabelsToSignature(ls), Values: ts}) - } - } - return res - }, - - qRecordingRuleApplicationL7Latency: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationL7Latency: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { var res []*model.MetricValues type key struct { protocol model.Protocol @@ -580,7 +530,7 @@ var RecordingRules = map[string]func(p *db.Project, w *model.World) []*model.Met return res }, - qRecordingRuleApplicationL7Histogram: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationL7Histogram: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { var res []*model.MetricValues type key struct { le float32 @@ -621,36 +571,53 @@ var RecordingRules = map[string]func(p *db.Project, w *model.World) []*model.Met return res }, - qRecordingRuleApplicationTraffic: func(p *db.Project, w *model.World) []*model.MetricValues { + qRecordingRuleApplicationTraffic: func(db *db.DB, p *db.Project, w *model.World) []*model.MetricValues { var res []*model.MetricValues for _, app := range w.Applications { - stats := trafficStats{ - InternetEgress: timeseries.NewAggregate(timeseries.NanSum), - CrossAZEgress: timeseries.NewAggregate(timeseries.NanSum), - CrossAZIngress: timeseries.NewAggregate(timeseries.NanSum), - } - for _, instance := range app.Instances { - for _, u := range instance.Upstreams { - stats.update(instance, u) - } - } appId := app.Id.String() - if ts := stats.InternetEgress.Get(); !ts.IsEmpty() { + if ts := app.TrafficStats.InternetEgress.Get(); !ts.IsEmpty() { ls := model.Labels{"app": appId, "kind": string(model.TrafficKindInternetEgress)} res = append(res, &model.MetricValues{Labels: ls, LabelsHash: promModel.LabelsToSignature(ls), Values: ts}) } - if ts := stats.CrossAZEgress.Get(); !ts.IsEmpty() { + if ts := app.TrafficStats.CrossAZEgress.Get(); !ts.IsEmpty() { ls := model.Labels{"app": appId, "kind": string(model.TrafficKindCrossAZEgress)} res = append(res, &model.MetricValues{Labels: ls, LabelsHash: promModel.LabelsToSignature(ls), Values: ts}) } - if ts := stats.CrossAZIngress.Get(); !ts.IsEmpty() { + if ts := app.TrafficStats.CrossAZIngress.Get(); !ts.IsEmpty() { ls := model.Labels{"app": appId, "kind": string(model.TrafficKindCrossAZIngress)} res = append(res, &model.MetricValues{Labels: ls, LabelsHash: promModel.LabelsToSignature(ls), Values: ts}) } } return res }, + + qRecordingRuleApplicationCategories: func(database *db.DB, p *db.Project, w *model.World) []*model.MetricValues { + var needSave bool + for _, app := range w.Applications { + if _, ok := p.Settings.ApplicationCategorySettings[app.Category]; !ok { + if p.Settings.ApplicationCategorySettings == nil { + 
p.Settings.ApplicationCategorySettings = map[model.ApplicationCategory]*db.ApplicationCategorySettings{} + } + p.Settings.ApplicationCategorySettings[app.Category] = nil + needSave = true + } + } + if needSave { + if err := database.SaveProjectSettings(p); err != nil { + klog.Errorln("failed to save project settings:", err) + } + } + return nil + }, + + qRecordingRuleApplicationSLO: func(database *db.DB, p *db.Project, w *model.World) []*model.MetricValues { + for _, app := range w.Applications { + updateAvailabilitySLOFromAnnotations(database, p, w, app) + updateLatencySLOFromAnnotations(database, p, w, app) + } + return nil + }, } func aggConnections(w *model.World, tsF func(c *model.Connection) *timeseries.TimeSeries) []*model.MetricValues { @@ -683,39 +650,92 @@ func aggConnections(w *model.World, tsF func(c *model.Connection) *timeseries.Ti return res } -type trafficStats struct { - InternetEgress *timeseries.Aggregate - CrossAZEgress *timeseries.Aggregate - CrossAZIngress *timeseries.Aggregate +func updateAvailabilitySLOFromAnnotations(database *db.DB, p *db.Project, w *model.World, app *model.Application) { + objectiveStr := app.GetAnnotation(model.ApplicationAnnotationSLOAvailabilityObjective) + cfg, _ := w.CheckConfigs.GetAvailability(app.Id) + if objectiveStr == "" { + return + } + cfgSaved := cfg + cfg.Source = model.CheckConfigSourceKubernetesAnnotations + cfg.Custom = false + cfg.Error = "" + objective, err := parseObjective(objectiveStr) + if err != nil { + cfg.Error = fmt.Sprintf("Invalid annotation 'coroot.com/slo-availability-objective': %s", err) + } + if cfg.Error != "" { + cfg.ObjectivePercentage = 0 // disable + } else { + cfg.ObjectivePercentage = objective + } + if cfg == cfgSaved { + return + } + if err = database.SaveCheckConfig(p.Id, app.Id, model.Checks.SLOAvailability.Id, []model.CheckConfigSLOAvailability{cfg}); err != nil { + klog.Errorln(err) + } } -func (ts *trafficStats) update(instance *model.Instance, u *model.Connection) { - if u.RemoteInstance == nil || instance.Node == nil { +func updateLatencySLOFromAnnotations(database *db.DB, p *db.Project, w *model.World, app *model.Application) { + objectiveStr := app.GetAnnotation(model.ApplicationAnnotationSLOLatencyObjective) + thresholdStr := app.GetAnnotation(model.ApplicationAnnotationSLOLatencyThreshold) + if objectiveStr == "" && thresholdStr == "" { return } - srcRegion := instance.Node.Region.Value() - if srcRegion == "" { - return + cfg, _ := w.CheckConfigs.GetLatency(app.Id, app.Category) + cfgSaved := cfg + cfg.Source = model.CheckConfigSourceKubernetesAnnotations + cfg.Custom = false + cfg.Error = "" + var err error + objective := cfg.ObjectivePercentage + threshold := cfg.ObjectiveBucket + if objectiveStr != "" { + objective, err = parseObjective(objectiveStr) + if err != nil { + cfg.Error = fmt.Sprintf("Invalid annotation 'coroot.com/slo-latency-objective': %s", err) + } } - if u.RemoteInstance.Node != nil { - dstRegion := u.RemoteInstance.Node.Region.Value() - srcAZ := instance.Node.AvailabilityZone.Value() - dstAZ := u.RemoteInstance.Node.AvailabilityZone.Value() - if dstRegion != "" && dstRegion == srcRegion && srcAZ != "" && dstAZ != "" { - if srcAZ == dstAZ { - return - } else { - ts.CrossAZEgress.Add(u.BytesSent) - ts.CrossAZIngress.Add(u.BytesReceived) - return - } + if objective > 0 && thresholdStr != "" { + threshold, err = parseThreshold(thresholdStr) + if err != nil && cfg.Error == "" { + cfg.Error = fmt.Sprintf("Invalid annotation 'coroot.com/slo-latency-threshold': %s", err) } } - ip, 
err := netaddr.ParseIP(u.ActualRemoteIP) - switch { - case err != nil: - ts.InternetEgress.Add(u.BytesSent) //fqdn - case utils.IsIpExternal(ip): - ts.InternetEgress.Add(u.BytesSent) + if cfg.Error != "" { + cfg.ObjectivePercentage = 0 // disable + } else { + cfg.ObjectivePercentage = objective + cfg.ObjectiveBucket = threshold + } + if cfg == cfgSaved { + return + } + if err = database.SaveCheckConfig(p.Id, app.Id, model.Checks.SLOLatency.Id, []model.CheckConfigSLOLatency{cfg}); err != nil { + klog.Errorln(err) + } +} + +func hasNotEmptyLabel(metricName string, labelNames []string) string { + var parts []string + for _, labelName := range labelNames { + parts = append(parts, fmt.Sprintf(`%s{%s != ""}`, metricName, labelName)) + } + return strings.Join(parts, " or ") +} + +func parseObjective(s string) (float32, error) { + s = strings.TrimSpace(strings.TrimRight(strings.TrimSpace(s), "%")) + v, err := strconv.ParseFloat(s, 32) + return float32(v), err +} + +func parseThreshold(s string) (float32, error) { + d, err := time.ParseDuration(strings.TrimSpace(s)) + if err != nil { + return 0, err } + v := model.RoundUpToDefaultBucket(float32(d.Seconds())) + return v, err } diff --git a/constructor/rds.go b/constructor/rds.go index 1d961a7c4..0c07f3f01 100644 --- a/constructor/rds.go +++ b/constructor/rds.go @@ -144,7 +144,7 @@ func (c *Constructor) loadRds(w *model.World, metrics map[string][]*model.Metric } if c.pricing != nil { for _, instance := range rdsInstancesById { - instance.Node.Price = c.pricing.GetNodePrice(instance.Node) + instance.Node.Price = c.pricing.GetNodePrice(nil, instance.Node) } } } diff --git a/db/application_categories.go b/db/application_categories.go new file mode 100644 index 000000000..a835d7832 --- /dev/null +++ b/db/application_categories.go @@ -0,0 +1,354 @@ +package db + +import ( + "fmt" + "strings" + + "github.com/coroot/coroot/model" + "github.com/coroot/coroot/utils" + "golang.org/x/exp/maps" +) + +type ApplicationCategory struct { + Name model.ApplicationCategory `json:"name"` + Builtin bool `json:"builtin"` + Default bool `json:"default"` + BuiltinPatterns string `json:"builtin_patterns"` + CustomPatterns string `json:"custom_patterns"` + NotificationSettings ApplicationCategoryNotificationSettings `json:"notification_settings"` +} + +type ApplicationCategorySettings struct { + CustomPatterns []string `json:"custom_patterns,omitempty" yaml:"customPatterns,omitempty"` + NotifyOfDeployments bool `json:"notify_of_deployments,omitempty"` // deprecated: use NotificationSettings + NotificationSettings ApplicationCategoryNotificationSettings `json:"notification_settings,omitempty" yaml:"notificationSettings,omitempty"` +} + +type ApplicationCategoryNotificationSettings struct { + Incidents ApplicationCategoryIncidentNotificationSettings `json:"incidents,omitempty" yaml:"incidents,omitempty"` + Deployments ApplicationCategoryDeploymentNotificationSettings `json:"deployments,omitempty" yaml:"deployments,omitempty"` +} + +type ApplicationCategoryIncidentNotificationSettings struct { + Enabled bool `json:"enabled" yaml:"enabled"` + ApplicationCategoryNotificationDestinations `yaml:",inline"` +} + +type ApplicationCategoryDeploymentNotificationSettings struct { + Enabled bool `json:"enabled" yaml:"enabled"` + ApplicationCategoryNotificationDestinations `yaml:",inline"` +} + +type ApplicationCategoryNotificationDestinations struct { + Slack *ApplicationCategoryNotificationSettingsSlack `json:"slack,omitempty" yaml:"slack,omitempty"` + Teams 
*ApplicationCategoryNotificationSettingsTeams `json:"teams,omitempty" yaml:"teams,omitempty"` + Pagerduty *ApplicationCategoryNotificationSettingsPagerduty `json:"pagerduty,omitempty" yaml:"pagerduty,omitempty"` + Opsgenie *ApplicationCategoryNotificationSettingsOpsgenie `json:"opsgenie,omitempty" yaml:"opsgenie,omitempty"` + Webhook *ApplicationCategoryNotificationSettingsWebhook `json:"webhook,omitempty" yaml:"webhook,omitempty"` +} + +func (s ApplicationCategoryNotificationDestinations) hasEnabled() bool { + return (s.Slack != nil && s.Slack.Enabled) || + (s.Teams != nil && s.Teams.Enabled) || + (s.Pagerduty != nil && s.Pagerduty.Enabled) || + (s.Opsgenie != nil && s.Opsgenie.Enabled) || + (s.Webhook != nil && s.Webhook.Enabled) +} + +type ApplicationCategoryNotificationSettingsSlack struct { + Enabled bool `json:"enabled" yaml:"enabled"` + Channel string `json:"channel" yaml:"channel"` +} + +type ApplicationCategoryNotificationSettingsTeams struct { + Enabled bool `json:"enabled" yaml:"enabled"` +} + +type ApplicationCategoryNotificationSettingsPagerduty struct { + Enabled bool `json:"enabled" yaml:"enabled"` +} + +type ApplicationCategoryNotificationSettingsOpsgenie struct { + Enabled bool `json:"enabled" yaml:"enabled"` +} + +type ApplicationCategoryNotificationSettingsWebhook struct { + Enabled bool `json:"enabled" yaml:"enabled"` +} + +func (p *Project) CalcApplicationCategory(appId model.ApplicationId) model.ApplicationCategory { + id := fmt.Sprintf("%s/%s", appId.Namespace, appId.Name) + + settings := p.Settings.ApplicationCategorySettings + names := maps.Keys(settings) + utils.SortSlice(names) + for _, name := range names { + if s := settings[name]; s == nil || len(s.CustomPatterns) == 0 { + continue + } else if utils.GlobMatch(id, s.CustomPatterns...) { + return name + } + } + + names = maps.Keys(model.BuiltinCategoryPatterns) + utils.SortSlice(names) + for _, name := range names { + if utils.GlobMatch(id, model.BuiltinCategoryPatterns[name]...) 
{ + return name + } + } + + return model.ApplicationCategoryApplication +} + +func (p *Project) GetApplicationCategories() map[model.ApplicationCategory]*ApplicationCategory { + res := map[model.ApplicationCategory]*ApplicationCategory{} + for c, settings := range p.Settings.ApplicationCategorySettings { + if c.Builtin() { + continue + } + category := &ApplicationCategory{ + Name: c, + } + if settings != nil { + category.CustomPatterns = strings.Join(settings.CustomPatterns, " ") + } + res[c] = category + } + for c, patterns := range model.BuiltinCategoryPatterns { + category := &ApplicationCategory{ + Name: c, + Builtin: true, + Default: c.Default(), + BuiltinPatterns: strings.Join(patterns, " "), + } + if settings := p.Settings.ApplicationCategorySettings[c]; settings != nil { + category.CustomPatterns = strings.Join(settings.CustomPatterns, " ") + } + res[c] = category + } + + for _, category := range res { + categorySettings := p.Settings.ApplicationCategorySettings[category.Name] + if categorySettings == nil { + categorySettings = &ApplicationCategorySettings{} + } + category.NotificationSettings = categorySettings.NotificationSettings + notifyOfDeployments := category.Default || categorySettings.NotifyOfDeployments + + { + integrationSlack := p.Settings.Integrations.Slack + if integrationSlack != nil { + if integrationSlack.Incidents { + if category.NotificationSettings.Incidents.Slack == nil { + category.NotificationSettings.Incidents.Enabled = true + category.NotificationSettings.Incidents.Slack = &ApplicationCategoryNotificationSettingsSlack{Enabled: true} + } + if category.NotificationSettings.Incidents.Slack.Channel == "" { + category.NotificationSettings.Incidents.Slack.Channel = integrationSlack.DefaultChannel + } + } + if integrationSlack.Deployments { + if category.NotificationSettings.Deployments.Slack == nil { + category.NotificationSettings.Deployments.Enabled = notifyOfDeployments + category.NotificationSettings.Deployments.Slack = &ApplicationCategoryNotificationSettingsSlack{Enabled: notifyOfDeployments} + } + if category.NotificationSettings.Deployments.Slack.Channel == "" { + category.NotificationSettings.Deployments.Slack.Channel = integrationSlack.DefaultChannel + } + } + } + if integrationSlack == nil || !integrationSlack.Incidents { + category.NotificationSettings.Incidents.Slack = nil + } + if integrationSlack == nil || !integrationSlack.Deployments { + category.NotificationSettings.Deployments.Slack = nil + } + } + { + integrationTeams := p.Settings.Integrations.Teams + if integrationTeams != nil { + if integrationTeams.Incidents { + if category.NotificationSettings.Incidents.Teams == nil { + category.NotificationSettings.Incidents.Enabled = true + category.NotificationSettings.Incidents.Teams = &ApplicationCategoryNotificationSettingsTeams{Enabled: true} + } + } + if integrationTeams.Deployments { + if category.NotificationSettings.Deployments.Teams == nil { + category.NotificationSettings.Deployments.Enabled = notifyOfDeployments + category.NotificationSettings.Deployments.Teams = &ApplicationCategoryNotificationSettingsTeams{Enabled: notifyOfDeployments} + } + } + } + if integrationTeams == nil || !integrationTeams.Incidents { + category.NotificationSettings.Incidents.Teams = nil + } + if integrationTeams == nil || !integrationTeams.Deployments { + category.NotificationSettings.Deployments.Teams = nil + } + } + { + integrationWebhook := p.Settings.Integrations.Webhook + if integrationWebhook != nil { + if integrationWebhook.Incidents { + if 
category.NotificationSettings.Incidents.Webhook == nil { + category.NotificationSettings.Incidents.Enabled = true + category.NotificationSettings.Incidents.Webhook = &ApplicationCategoryNotificationSettingsWebhook{Enabled: true} + } + } + if integrationWebhook.Deployments { + if category.NotificationSettings.Deployments.Webhook == nil { + category.NotificationSettings.Deployments.Enabled = notifyOfDeployments + category.NotificationSettings.Deployments.Webhook = &ApplicationCategoryNotificationSettingsWebhook{Enabled: notifyOfDeployments} + } + } + } + if integrationWebhook == nil || !integrationWebhook.Incidents { + category.NotificationSettings.Incidents.Webhook = nil + } + if integrationWebhook == nil || !integrationWebhook.Deployments { + category.NotificationSettings.Deployments.Webhook = nil + } + } + { + integrationPagerduty := p.Settings.Integrations.Pagerduty + if integrationPagerduty != nil { + if integrationPagerduty.Incidents { + category.NotificationSettings.Incidents.Enabled = true + if category.NotificationSettings.Incidents.Pagerduty == nil { + category.NotificationSettings.Incidents.Pagerduty = &ApplicationCategoryNotificationSettingsPagerduty{Enabled: true} + } + } + } + if integrationPagerduty == nil || !integrationPagerduty.Incidents { + category.NotificationSettings.Incidents.Pagerduty = nil + } + } + { + integrationOpsgenie := p.Settings.Integrations.Opsgenie + if integrationOpsgenie != nil { + if integrationOpsgenie.Incidents { + category.NotificationSettings.Incidents.Enabled = true + if category.NotificationSettings.Incidents.Opsgenie == nil { + category.NotificationSettings.Incidents.Opsgenie = &ApplicationCategoryNotificationSettingsOpsgenie{Enabled: true} + } + } + } + if integrationOpsgenie == nil || !integrationOpsgenie.Incidents { + category.NotificationSettings.Incidents.Opsgenie = nil + } + } + + if !category.NotificationSettings.Incidents.hasEnabled() { + category.NotificationSettings.Incidents.Enabled = false + } + if !category.NotificationSettings.Deployments.hasEnabled() { + category.NotificationSettings.Deployments.Enabled = false + } + } + + return res +} + +func (p *Project) NewApplicationCategory() *ApplicationCategory { + category := &ApplicationCategory{} + if slack := p.Settings.Integrations.Slack; slack != nil { + if slack.Incidents { + category.NotificationSettings.Incidents.Slack = &ApplicationCategoryNotificationSettingsSlack{Channel: slack.DefaultChannel} + } + if slack.Deployments { + category.NotificationSettings.Deployments.Slack = &ApplicationCategoryNotificationSettingsSlack{Channel: slack.DefaultChannel} + } + } + if teams := p.Settings.Integrations.Teams; teams != nil { + if teams.Incidents { + category.NotificationSettings.Incidents.Teams = &ApplicationCategoryNotificationSettingsTeams{} + } + if teams.Deployments { + category.NotificationSettings.Deployments.Teams = &ApplicationCategoryNotificationSettingsTeams{} + } + } + if webhook := p.Settings.Integrations.Webhook; webhook != nil { + if webhook.Incidents { + category.NotificationSettings.Incidents.Webhook = &ApplicationCategoryNotificationSettingsWebhook{} + } + if webhook.Deployments { + category.NotificationSettings.Deployments.Webhook = &ApplicationCategoryNotificationSettingsWebhook{} + } + } + if pagerduty := p.Settings.Integrations.Pagerduty; pagerduty != nil { + if pagerduty.Incidents { + category.NotificationSettings.Incidents.Pagerduty = &ApplicationCategoryNotificationSettingsPagerduty{} + } + } + if opsgenie := p.Settings.Integrations.Opsgenie; opsgenie != nil { + if 
opsgenie.Incidents { + category.NotificationSettings.Incidents.Opsgenie = &ApplicationCategoryNotificationSettingsOpsgenie{} + } + } + return category +} + +func (db *DB) SaveApplicationCategory(project *Project, name model.ApplicationCategory, category *ApplicationCategory) error { + settings := project.Settings.ApplicationCategorySettings + if settings == nil { + settings = map[model.ApplicationCategory]*ApplicationCategorySettings{} + project.Settings.ApplicationCategorySettings = settings + } + + if category == nil { // delete + if !name.Builtin() { + delete(settings, name) + return db.SaveProjectSettings(project) + } + return nil + } + + if name != category.Name && (name.Builtin() || settings[category.Name] != nil) { + return ErrConflict + } + + if !name.Builtin() && category.Name != name { + delete(settings, name) + } + categorySettings := settings[category.Name] + if categorySettings == nil { + categorySettings = &ApplicationCategorySettings{} + settings[category.Name] = categorySettings + } + if !category.Name.Default() { + categorySettings.CustomPatterns = strings.Fields(category.CustomPatterns) + } + categorySettings.NotificationSettings = category.NotificationSettings + if slack := categorySettings.NotificationSettings.Incidents.Slack; slack != nil { + if s := project.Settings.Integrations.Slack; s != nil && slack.Channel == s.DefaultChannel { + slack.Channel = "" + } + } + if slack := categorySettings.NotificationSettings.Deployments.Slack; slack != nil { + if s := project.Settings.Integrations.Slack; s != nil && slack.Channel == s.DefaultChannel { + slack.Channel = "" + } + } + + return db.SaveProjectSettings(project) +} + +func (db *DB) migrateApplicationCategories(p *Project) error { + if p.Settings.ApplicationCategories == nil { + return nil + } + if p.Settings.ApplicationCategorySettings == nil { + p.Settings.ApplicationCategorySettings = map[model.ApplicationCategory]*ApplicationCategorySettings{} + } + for name, patterns := range p.Settings.ApplicationCategories { + if settings := p.Settings.ApplicationCategorySettings[name]; settings != nil && len(settings.CustomPatterns) == 0 { + settings.CustomPatterns = patterns + } else { + p.Settings.ApplicationCategorySettings[name] = &ApplicationCategorySettings{CustomPatterns: patterns} + } + } + //p.Settings.ApplicationCategories = nil + return db.SaveProjectSettings(p) +} diff --git a/db/check_configs.go b/db/check_configs.go index f77e0f4ce..939f1a98d 100644 --- a/db/check_configs.go +++ b/db/check_configs.go @@ -77,6 +77,9 @@ func (db *DB) SaveCheckConfig(projectId ProjectId, appId model.ApplicationId, ch return err } res, err := db.db.Exec("UPDATE check_configs SET configs = $1 WHERE project_id = $2 AND application_id = $3", string(data), projectId, appIdStr) + if err != nil { + return err + } rowsAffected, _ := res.RowsAffected() if rowsAffected == 0 { if _, err := db.db.Exec("INSERT INTO check_configs (project_id, application_id, configs) VALUES ($1, $2, $3)", projectId, appIdStr, string(data)); err != nil { diff --git a/db/custom_cloud_pricing.go b/db/custom_cloud_pricing.go new file mode 100644 index 000000000..a93850dfd --- /dev/null +++ b/db/custom_cloud_pricing.go @@ -0,0 +1,13 @@ +package db + +type CustomCloudPricing struct { + Default bool `json:"default"` + PerCPUCore float32 `json:"per_cpu_core"` + PerMemoryGb float32 `json:"per_memory_gb"` +} + +var defaultCustomCloudPricing = CustomCloudPricing{ //on-demand pricing for GCP (C4 machine family, us-central1) + Default: true, + PerCPUCore: 0.03465, + PerMemoryGb: 
0.003938, +} diff --git a/db/dashboards.go b/db/dashboards.go new file mode 100644 index 000000000..5de246b1c --- /dev/null +++ b/db/dashboards.go @@ -0,0 +1,136 @@ +package db + +import ( + "database/sql" + "encoding/json" + "errors" + + "github.com/coroot/coroot/utils" +) + +type Dashboards struct{} + +func (s *Dashboards) Migrate(m *Migrator) error { + return m.Exec(` + CREATE TABLE IF NOT EXISTS dashboards ( + project_id TEXT NOT NULL REFERENCES project(id), + id TEXT NOT NULL, + name TEXT NOT NULL, + description TEXT NOT NULL DEFAULT '', + config TEXT NOT NULL DEFAULT '{}', + PRIMARY KEY (project_id, id) + )`) +} + +type Dashboard struct { + Id string `json:"id"` + Name string `json:"name"` + Description string `json:"description"` + Config DashboardConfig `json:"config"` +} + +type DashboardConfig struct { + Groups []DashboardPanelGroup `json:"groups"` +} + +type DashboardPanelGroup struct { + Name string `json:"name"` + Panels []DashboardPanel `json:"panels"` + Collapsed bool `json:"collapsed"` +} + +type DashboardPanel struct { + Name string `json:"name"` + Description string `json:"description"` + Source DashboardPanelSource `json:"source"` + Widget DashboardPanelWidget `json:"widget"` + Box DashboardPanelBox `json:"box"` +} + +type DashboardPanelBox struct { + X int `json:"x"` + Y int `json:"y"` + W int `json:"w"` + H int `json:"h"` +} + +type DashboardPanelSource struct { + Metrics *DashboardPanelSourceMetrics `json:"metrics,omitempty"` +} + +type DashboardPanelSourceMetrics struct { + Queries []DashboardPanelSourceMetricsQuery `json:"queries"` +} + +type DashboardPanelSourceMetricsQuery struct { + Query string `json:"query"` + Legend string `json:"legend"` + Color string `json:"color"` +} + +type DashboardPanelWidget struct { + Chart *DashboardPanelChart `json:"chart,omitempty"` +} + +type DashboardPanelChart struct { + Display string `json:"display"` + Stacked bool `json:"stacked"` +} + +func (db *DB) GetDashboards(projectId ProjectId) ([]*Dashboard, error) { + rows, err := db.Query("SELECT id, name, description FROM dashboards WHERE project_id = $1", projectId) + if err != nil { + return nil, err + } + defer rows.Close() + var ds []*Dashboard + for rows.Next() { + var d Dashboard + if err = rows.Scan(&d.Id, &d.Name, &d.Description); err != nil { + return nil, err + } + ds = append(ds, &d) + } + return ds, nil +} + +func (db *DB) GetDashboard(projectId ProjectId, id string) (*Dashboard, error) { + d := &Dashboard{Id: id} + var config string + err := db.db.QueryRow("SELECT name, config FROM dashboards WHERE project_id = $1 AND id = $2", projectId, id).Scan(&d.Name, &config) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + return nil, err + } + if err = json.Unmarshal([]byte(config), &d.Config); err != nil { + return nil, err + } + return d, nil +} + +func (db *DB) CreateDashboard(projectId ProjectId, name, description string) (string, error) { + id := utils.NanoId(8) + _, err := db.db.Exec("INSERT INTO dashboards (project_id, id, name, description) VALUES ($1, $2, $3, $4)", projectId, id, name, description) + return id, err +} + +func (db *DB) UpdateDashboard(projectId ProjectId, id, name, description string) error { + _, err := db.db.Exec("UPDATE dashboards SET name = $1, description=$2 WHERE project_id = $3 AND id = $4", name, description, projectId, id) + return err +} + +func (db *DB) SaveDashboardConfig(projectId ProjectId, id string, config DashboardConfig) error { + cfg, err := json.Marshal(config) + if err != nil { + return err + } 
+ _, err = db.db.Exec("UPDATE dashboards SET config = $1 WHERE project_id = $2 AND id = $3", string(cfg), projectId, id) + return err +} + +func (db *DB) DeleteDashboard(projectId ProjectId, id string) error { + _, err := db.db.Exec("DELETE FROM dashboards WHERE project_id = $1 AND id = $2", projectId, id) + return err +} diff --git a/db/db.go b/db/db.go index b2f33c7ec..7d365d615 100644 --- a/db/db.go +++ b/db/db.go @@ -62,8 +62,16 @@ func (db *DB) DB() *sql.DB { return db.db } +func (db *DB) Exec(query string, args ...any) (sql.Result, error) { + return db.db.Exec(query, args...) +} + +func (db *DB) Query(query string, args ...any) (*sql.Rows, error) { + return db.db.Query(query, args...) +} + func (db *DB) Migrator() *Migrator { - return NewMigrator(db.typ, db.db) + return NewMigrator(db.typ, db) } func (db *DB) Migrate(extraTables ...Table) error { @@ -74,6 +82,7 @@ func (db *DB) Migrate(extraTables ...Table) error { &IncidentNotification{}, &ApplicationDeployment{}, &ApplicationSettings{}, + &Dashboards{}, &Setting{}, &User{}, } @@ -98,10 +107,10 @@ type Table interface { type Migrator struct { typ Type - db *sql.DB + db *DB } -func NewMigrator(t Type, db *sql.DB) *Migrator { +func NewMigrator(t Type, db *DB) *Migrator { return &Migrator{typ: t, db: db} } diff --git a/db/incident.go b/db/incident.go index 6890b60d2..b8a2a6084 100644 --- a/db/incident.go +++ b/db/incident.go @@ -2,7 +2,10 @@ package db import ( "database/sql" + "database/sql/driver" "errors" + "fmt" + "strings" "github.com/coroot/coroot/model" "github.com/coroot/coroot/timeseries" @@ -32,7 +35,7 @@ type IncidentNotification struct { ApplicationId model.ApplicationId IncidentKey string Status model.Status - Destination IntegrationType + Destination IncidentNotificationDestination Timestamp timeseries.Time SentAt timeseries.Time ExternalKey string @@ -55,6 +58,35 @@ func (n *IncidentNotification) Migrate(m *Migrator) error { `) } +type IncidentNotificationDestination struct { + IntegrationType IntegrationType + SlackChannel string +} + +func (d IncidentNotificationDestination) Value() (driver.Value, error) { + switch d.IntegrationType { + case IntegrationTypeSlack: + return fmt.Sprintf("%s:%s", d.IntegrationType, d.SlackChannel), nil + } + return fmt.Sprintf("%s", d.IntegrationType), nil +} + +func (d *IncidentNotificationDestination) Scan(src any) error { + *d = IncidentNotificationDestination{} + parts := strings.Split(src.(string), ":") + if len(parts) == 0 { + return nil + } + d.IntegrationType = IntegrationType(parts[0]) + if len(parts) > 1 { + switch d.IntegrationType { + case IntegrationTypeSlack: + d.SlackChannel = parts[1] + } + } + return nil +} + type IncidentNotificationDetails struct { Reports []IncidentNotificationDetailsReport `json:"reports"` } @@ -70,6 +102,9 @@ func (db *DB) GetIncidentByKey(projectId ProjectId, key string) (*model.Applicat err := db.db.QueryRow( "SELECT application_id, opened_at, resolved_at, severity FROM incident WHERE project_id = $1 AND key = $2 LIMIT 1", projectId, key).Scan(&i.ApplicationId, &i.OpenedAt, &i.ResolvedAt, &i.Severity) + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } return i, err } diff --git a/db/integrations.go b/db/integrations.go index eb5c2c031..7049cad3f 100644 --- a/db/integrations.go +++ b/db/integrations.go @@ -2,6 +2,8 @@ package db import ( "fmt" + "net/url" + "strings" "github.com/coroot/coroot/timeseries" "github.com/coroot/coroot/utils" @@ -21,26 +23,70 @@ const ( ) type Integrations struct { - BaseUrl string `json:"base_url"` - - 
Slack *IntegrationSlack `json:"slack,omitempty"` - Pagerduty *IntegrationPagerduty `json:"pagerduty,omitempty"` - Teams *IntegrationTeams `json:"teams,omitempty"` - Opsgenie *IntegrationOpsgenie `json:"opsgenie,omitempty"` - Webhook *IntegrationWebhook `json:"webhook,omitempty"` - Clickhouse *IntegrationClickhouse `json:"clickhouse,omitempty"` AWS *IntegrationAWS `json:"aws"` + + NotificationIntegrations +} + +type NotificationIntegrations struct { + Readonly bool `json:"readonly" yaml:"-"` + BaseUrl string `json:"base_url" yaml:"baseURL"` + + Slack *IntegrationSlack `json:"slack,omitempty" yaml:"slack,omitempty"` + Teams *IntegrationTeams `json:"teams,omitempty" yaml:"teams,omitempty"` + Pagerduty *IntegrationPagerduty `json:"pagerduty,omitempty" yaml:"pagerduty,omitempty"` + Opsgenie *IntegrationOpsgenie `json:"opsgenie,omitempty" yaml:"opsgenie,omitempty"` + Webhook *IntegrationWebhook `json:"webhook,omitempty" yaml:"webhook,omitempty"` +} + +func (i *NotificationIntegrations) Validate() error { + if i.BaseUrl == "" { + return fmt.Errorf("base url is required") + } + if _, err := url.Parse(i.BaseUrl); err != nil { + return fmt.Errorf("invalid base url") + } + i.BaseUrl = strings.TrimRight(i.BaseUrl, "/") + + if i.Slack != nil { + if err := i.Slack.Validate(); err != nil { + return fmt.Errorf("invalid slack configuration: %w", err) + } + } + if i.Teams != nil { + if err := i.Teams.Validate(); err != nil { + return fmt.Errorf("invalid teams configuration: %w", err) + } + } + if i.Pagerduty != nil { + if err := i.Pagerduty.Validate(); err != nil { + return fmt.Errorf("invalid pagerduty configuration: %w", err) + } + } + if i.Opsgenie != nil { + if err := i.Opsgenie.Validate(); err != nil { + return fmt.Errorf("invalid opsgenie configuration: %w", err) + } + } + if i.Webhook != nil { + if err := i.Webhook.Validate(); err != nil { + return fmt.Errorf("invalid webhook configuration: %w", err) + } + } + + return nil + } type IntegrationInfo struct { - Type IntegrationType - Configured bool - Incidents bool - Deployments bool - Title string - Details string + Type IntegrationType `json:"type"` + Configured bool `json:"configured"` + Incidents bool `json:"incidents"` + Deployments bool `json:"deployments"` + Title string `json:"title"` + Details string `json:"details"` } func (integrations Integrations) GetInfo() []IntegrationInfo { @@ -51,7 +97,7 @@ func (integrations Integrations) GetInfo() []IntegrationInfo { i.Configured = true i.Incidents = cfg.Incidents i.Deployments = cfg.Deployments - i.Details = fmt.Sprintf("channel: #%s", cfg.DefaultChannel) + i.Details = fmt.Sprintf("default channel: #%s", cfg.DefaultChannel) } res = append(res, i) @@ -117,39 +163,82 @@ type IntegrationClickhouse struct { } type IntegrationSlack struct { - Token string `json:"token"` - DefaultChannel string `json:"default_channel"` - Enabled bool `json:"enabled"` // deprecated: use Incidents and Deployments - Incidents bool `json:"incidents"` - Deployments bool `json:"deployments"` + Token string `json:"token" yaml:"token"` + DefaultChannel string `json:"default_channel" yaml:"defaultChannel"` + Incidents bool `json:"incidents" yaml:"incidents"` + Deployments bool `json:"deployments" yaml:"deployments"` +} + +func (i *IntegrationSlack) Validate() error { + if i.Token == "" { + return fmt.Errorf("token is required") + } + if i.DefaultChannel == "" { + return fmt.Errorf("default channel is required") + } + return nil } type IntegrationTeams struct { - WebhookUrl string `json:"webhook_url"` - Incidents bool 
`json:"incidents"` - Deployments bool `json:"deployments"` + WebhookUrl string `json:"webhook_url" yaml:"webhookURL"` + Incidents bool `json:"incidents" yaml:"incidents"` + Deployments bool `json:"deployments" yaml:"deployments"` +} + +func (i *IntegrationTeams) Validate() error { + if i.WebhookUrl == "" { + return fmt.Errorf("webhook url is required") + } + return nil } type IntegrationPagerduty struct { - IntegrationKey string `json:"integration_key"` - Incidents bool `json:"incidents"` + IntegrationKey string `json:"integration_key" yaml:"integrationKey"` + Incidents bool `json:"incidents" yaml:"incidents"` +} + +func (i *IntegrationPagerduty) Validate() error { + if i.IntegrationKey == "" { + return fmt.Errorf("integration key is required") + } + return nil } type IntegrationOpsgenie struct { - ApiKey string `json:"api_key"` - EUInstance bool `json:"eu_instance"` - Incidents bool `json:"incidents"` + ApiKey string `json:"api_key" yaml:"apiKey"` + EUInstance bool `json:"eu_instance" yaml:"euInstance"` + Incidents bool `json:"incidents" yaml:"incidents"` +} + +func (i *IntegrationOpsgenie) Validate() error { + if i.ApiKey == "" { + return fmt.Errorf("api key is required") + } + return nil } type IntegrationWebhook struct { - Url string `json:"url"` - TlsSkipVerify bool `json:"tls_skip_verify"` - BasicAuth *utils.BasicAuth `json:"basic_auth"` - CustomHeaders []utils.Header `json:"custom_headers"` - Incidents bool `json:"incidents"` - Deployments bool `json:"deployments"` - IncidentTemplate string `json:"incident_template"` - DeploymentTemplate string `json:"deployment_template"` + Url string `json:"url" yaml:"url"` + TlsSkipVerify bool `json:"tls_skip_verify" yaml:"tlsSkipVerify"` + BasicAuth *utils.BasicAuth `json:"basic_auth" yaml:"basicAuth"` + CustomHeaders []utils.Header `json:"custom_headers" yaml:"customHeaders"` + Incidents bool `json:"incidents" yaml:"incidents"` + Deployments bool `json:"deployments" yaml:"deployments"` + IncidentTemplate string `json:"incident_template" yaml:"incidentTemplate"` + DeploymentTemplate string `json:"deployment_template" yaml:"deploymentTemplate"` +} + +func (i *IntegrationWebhook) Validate() error { + if i.Url == "" { + return fmt.Errorf("url is required") + } + if i.Incidents && i.IncidentTemplate == "" { + return fmt.Errorf("incident template is required") + } + if i.Deployments && i.DeploymentTemplate == "" { + return fmt.Errorf("deployment template is required") + } + return nil } type IntegrationAWS struct { diff --git a/db/project.go b/db/project.go index fab51c01a..460f9620c 100644 --- a/db/project.go +++ b/db/project.go @@ -26,21 +26,25 @@ type Project struct { } type ProjectSettings struct { - ApplicationCategories map[model.ApplicationCategory][]string `json:"application_categories"` - ApplicationCategorySettings map[model.ApplicationCategory]ApplicationCategorySettings `json:"application_category_settings"` - Integrations Integrations `json:"integrations"` - CustomApplications map[string]model.CustomApplication `json:"custom_applications"` - ApiKeys []ApiKey `json:"api_keys"` - Configurable bool `json:"configurable"` + Readonly bool `json:"readonly"` + ApplicationCategories map[model.ApplicationCategory][]string `json:"application_categories,omitempty"` // deprecated: use ApplicationCategorySettings + ApplicationCategorySettings map[model.ApplicationCategory]*ApplicationCategorySettings `json:"application_category_settings"` + Integrations Integrations `json:"integrations"` + CustomApplications map[string]model.CustomApplication 
`json:"custom_applications"` + ApiKeys []ApiKey `json:"api_keys"` + CustomCloudPricing *CustomCloudPricing `json:"custom_cloud_pricing"` } -type ApplicationCategorySettings struct { - NotifyOfDeployments bool `json:"notify_of_deployments"` +type ApiKey struct { + Key string `json:"key" yaml:"key"` + Description string `json:"description" yaml:"description"` } -type ApiKey struct { - Key string `json:"key"` - Description string `json:"description"` +func (k *ApiKey) Validate() error { + if k.Key == "" { + return fmt.Errorf("key is required") + } + return nil } func (p *Project) Migrate(m *Migrator) error { @@ -53,9 +57,19 @@ func (p *Project) Migrate(m *Migrator) error { if err != nil { return err } - if err := m.AddColumnIfNotExists("project", "settings", "text"); err != nil { + if err = m.AddColumnIfNotExists("project", "settings", "text"); err != nil { + return err + } + + projects, err := m.db.GetProjects() + if err != nil { return err } + for _, project := range projects { + if err = m.db.migrateApplicationCategories(project); err != nil { + return err + } + } return nil } @@ -63,25 +77,14 @@ func (p *Project) applyDefaults() { if p.Prometheus.RefreshInterval == 0 { p.Prometheus.RefreshInterval = DefaultRefreshInterval } - if _, ok := p.Settings.ApplicationCategorySettings[model.ApplicationCategoryApplication]; !ok { - if p.Settings.ApplicationCategorySettings == nil { - p.Settings.ApplicationCategorySettings = map[model.ApplicationCategory]ApplicationCategorySettings{} - } - p.Settings.ApplicationCategorySettings[model.ApplicationCategoryApplication] = ApplicationCategorySettings{NotifyOfDeployments: true} - } - if cfg := p.Settings.Integrations.Slack; cfg != nil { - if !cfg.Incidents { - cfg.Incidents = cfg.Enabled - } - if !cfg.Deployments { - cfg.Deployments = cfg.Enabled - } + if p.Settings.CustomCloudPricing == nil { + p.Settings.CustomCloudPricing = &defaultCustomCloudPricing } } func (p *Project) GetCustomApplicationName(instance string) string { for customAppName, cfg := range p.Settings.CustomApplications { - if utils.GlobMatch(instance, cfg.InstancePattens...) { + if utils.GlobMatch(instance, cfg.InstancePatterns...) 
{ return customAppName } } @@ -120,16 +123,16 @@ func (db *DB) GetProjects() ([]*Project, error) { var settings sql.NullString for rows.Next() { var p Project - if err := rows.Scan(&p.Id, &p.Name, &prometheus, &settings); err != nil { + if err = rows.Scan(&p.Id, &p.Name, &prometheus, &settings); err != nil { return nil, err } if prometheus.Valid { - if err := json.Unmarshal([]byte(prometheus.String), &p.Prometheus); err != nil { + if err = json.Unmarshal([]byte(prometheus.String), &p.Prometheus); err != nil { return nil, err } } if settings.Valid { - if err := json.Unmarshal([]byte(settings.String), &p.Settings); err != nil { + if err = json.Unmarshal([]byte(settings.String), &p.Settings); err != nil { return nil, err } } @@ -163,19 +166,20 @@ func (db *DB) GetProject(id ProjectId) (*Project, error) { p := Project{Id: id} var prometheus sql.NullString var settings sql.NullString - if err := db.db.QueryRow("SELECT name, prometheus, settings FROM project WHERE id = $1", id).Scan(&p.Name, &prometheus, &settings); err != nil { + err := db.db.QueryRow("SELECT name, prometheus, settings FROM project WHERE id = $1", id).Scan(&p.Name, &prometheus, &settings) + if err != nil { if errors.Is(err, sql.ErrNoRows) { return nil, ErrNotFound } return nil, err } if prometheus.Valid { - if err := json.Unmarshal([]byte(prometheus.String), &p.Prometheus); err != nil { + if err = json.Unmarshal([]byte(prometheus.String), &p.Prometheus); err != nil { return nil, err } } if settings.Valid { - if err := json.Unmarshal([]byte(settings.String), &p.Settings); err != nil { + if err = json.Unmarshal([]byte(settings.String), &p.Settings); err != nil { return nil, err } } @@ -213,22 +217,25 @@ func (db *DB) DeleteProject(id ProjectId) error { defer func() { _ = tx.Rollback() }() - if _, err := tx.Exec("DELETE FROM check_configs WHERE project_id = $1", id); err != nil { + if _, err = tx.Exec("DELETE FROM check_configs WHERE project_id = $1", id); err != nil { + return err + } + if _, err = tx.Exec("DELETE FROM incident_notification WHERE project_id = $1", id); err != nil { return err } - if _, err := tx.Exec("DELETE FROM incident_notification WHERE project_id = $1", id); err != nil { + if _, err = tx.Exec("DELETE FROM incident WHERE project_id = $1", id); err != nil { return err } - if _, err := tx.Exec("DELETE FROM incident WHERE project_id = $1", id); err != nil { + if _, err = tx.Exec("DELETE FROM application_deployment WHERE project_id = $1", id); err != nil { return err } - if _, err := tx.Exec("DELETE FROM application_deployment WHERE project_id = $1", id); err != nil { + if _, err = tx.Exec("DELETE FROM application_settings WHERE project_id = $1", id); err != nil { return err } - if _, err := tx.Exec("DELETE FROM application_settings WHERE project_id = $1", id); err != nil { + if _, err = tx.Exec("DELETE FROM dashboards WHERE project_id = $1", id); err != nil { return err } - if _, err := tx.Exec("DELETE FROM project WHERE id = $1", id); err != nil { + if _, err = tx.Exec("DELETE FROM project WHERE id = $1", id); err != nil { return err } return tx.Commit() @@ -243,49 +250,6 @@ func (db *DB) SaveProjectSettings(p *Project) error { return err } -func (db *DB) SaveApplicationCategory(id ProjectId, category, newName model.ApplicationCategory, customPatterns []string, notifyAboutDeployments bool) error { - p, err := db.GetProject(id) - if err != nil { - return err - } - - if p.Settings.ApplicationCategorySettings[category].NotifyOfDeployments != notifyAboutDeployments { - if p.Settings.ApplicationCategorySettings == 
nil { - p.Settings.ApplicationCategorySettings = map[model.ApplicationCategory]ApplicationCategorySettings{} - } - p.Settings.ApplicationCategorySettings[category] = ApplicationCategorySettings{NotifyOfDeployments: notifyAboutDeployments} - } - - if !category.Default() { - var patterns []string - for _, p := range customPatterns { - p = strings.TrimSpace(p) - if len(p) == 0 { - continue - } - patterns = append(patterns, p) - } - - if len(patterns) == 0 { // delete - delete(p.Settings.ApplicationCategories, category) - delete(p.Settings.ApplicationCategorySettings, category) - } else { - if p.Settings.ApplicationCategories == nil { - p.Settings.ApplicationCategories = map[model.ApplicationCategory][]string{} - } - if category != newName && !category.Builtin() { // rename - delete(p.Settings.ApplicationCategories, category) - p.Settings.ApplicationCategorySettings[newName] = p.Settings.ApplicationCategorySettings[category] - delete(p.Settings.ApplicationCategorySettings, category) - category = newName - } - p.Settings.ApplicationCategories[category] = patterns - } - } - - return db.SaveProjectSettings(p) -} - func (db *DB) SaveCustomApplication(id ProjectId, name, newName string, instancePatterns []string) error { p, err := db.GetProject(id) if err != nil { @@ -312,7 +276,7 @@ func (db *DB) SaveCustomApplication(id ProjectId, name, newName string, instance delete(p.Settings.CustomApplications, name) name = newName } - p.Settings.CustomApplications[name] = model.CustomApplication{InstancePattens: patterns} + p.Settings.CustomApplications[name] = model.CustomApplication{InstancePatterns: patterns} } return db.SaveProjectSettings(p) } diff --git a/deploy/docker-compose.yaml b/deploy/docker-compose.yaml index 25aea6a78..54aec560d 100644 --- a/deploy/docker-compose.yaml +++ b/deploy/docker-compose.yaml @@ -1,5 +1,3 @@ -version: '3.8' - name: coroot volumes: @@ -13,8 +11,9 @@ volumes: services: coroot: restart: always - image: ghcr.io/coroot/coroot + image: ghcr.io/coroot/coroot${LICENSE_KEY:+-ee} # set 'coroot-ee' as the image if LICENSE_KEY is defined pull_policy: always + user: root volumes: - coroot_data:/data ports: @@ -24,6 +23,8 @@ services: - '--bootstrap-prometheus-url=http://prometheus:9090' - '--bootstrap-refresh-interval=15s' - '--bootstrap-clickhouse-address=clickhouse:9000' + environment: + - LICENSE_KEY=${LICENSE_KEY:-} depends_on: - clickhouse - prometheus @@ -75,6 +76,8 @@ services: clickhouse: restart: always image: clickhouse/clickhouse-server:24.3 + environment: + CLICKHOUSE_SKIP_USER_SETUP: "1" volumes: - clickhouse_data:/var/lib/clickhouse - clickhouse_logs:/var/log/clickhouse-server diff --git a/deploy/docker-swarm-stack.yaml b/deploy/docker-swarm-stack.yaml index 9cc8a35b9..f14afa1a4 100644 --- a/deploy/docker-swarm-stack.yaml +++ b/deploy/docker-swarm-stack.yaml @@ -1,5 +1,3 @@ -version: "3.8" - volumes: prometheus_data: {} clickhouse_data: {} @@ -10,8 +8,9 @@ volumes: services: coroot: restart: always - image: ghcr.io/coroot/coroot + image: ghcr.io/coroot/coroot${LICENSE_KEY:+-ee} # set 'coroot-ee' as the image if LICENSE_KEY is defined pull_policy: always + user: root volumes: - coroot_data:/data ports: @@ -24,6 +23,8 @@ services: - '--bootstrap-prometheus-url=http://prometheus:9090' - '--bootstrap-refresh-interval=15s' - '--bootstrap-clickhouse-address=clickhouse:9000' + environment: + - LICENSE_KEY=${LICENSE_KEY:-} depends_on: - clickhouse - prometheus diff --git a/deploy/install.sh b/deploy/install.sh index cf577dd03..8f49f5e4b 100644 --- a/deploy/install.sh +++ 
b/deploy/install.sh @@ -6,7 +6,7 @@ if [ $(id -u) -eq 0 ]; then SUDO= fi -BIN_DIR=/usr/bin +BIN_DIR=/usr/local/bin SYSTEMD_DIR=/etc/systemd/system DATA_DIR=/var/lib/coroot @@ -17,6 +17,7 @@ SYSTEM_NAME= SYSTEM_DESCRIPTION= FILE_SERVICE= FILE_ENV= +REPO= ARGS= info() { @@ -95,7 +96,7 @@ get_release_version() { download_binary() { info "Downloading binary" - URL="${GITHUB_URL}/download/${VERSION}/${SYSTEM_NAME}-${ARCH}" + URL="${GITHUB_URL}/download/${VERSION}/${REPO}-${ARCH}" set +e case $DOWNLOADER in curl) @@ -175,7 +176,7 @@ create_env_file() { $SUDO chmod 0600 ${FILE_ENV} case $SYSTEM_NAME in coroot) - env_vars="LISTEN|URL_BASE_PATH|CACHE_TTL|CACHE_GC_INTERVAL|TRACES_TTL|LOGS_TTL|PROFILES_TTL|PG_CONNECTION_STRING|DISABLE_USAGE_STATISTICS|READ_ONLY|BOOTSTRAP_PROMETHEUS_URL|BOOTSTRAP_REFRESH_INTERVAL|BOOTSTRAP_PROMETHEUS_EXTRA_SELECTOR|DO_NOT_CHECK_SLO|DO_NOT_CHECK_FOR_DEPLOYMENTS|DO_NOT_CHECK_FOR_UPDATES|BOOTSTRAP_CLICKHOUSE_ADDRESS|BOOTSTRAP_CLICKHOUSE_USER|BOOTSTRAP_CLICKHOUSE_PASSWORD|BOOTSTRAP_CLICKHOUSE_DATABASE" + env_vars="LICENSE_KEY|LISTEN|URL_BASE_PATH|CACHE_TTL|CACHE_GC_INTERVAL|TRACES_TTL|LOGS_TTL|PROFILES_TTL|PG_CONNECTION_STRING|DISABLE_USAGE_STATISTICS|READ_ONLY|BOOTSTRAP_PROMETHEUS_URL|BOOTSTRAP_REFRESH_INTERVAL|BOOTSTRAP_PROMETHEUS_EXTRA_SELECTOR|DO_NOT_CHECK_SLO|DO_NOT_CHECK_FOR_DEPLOYMENTS|DO_NOT_CHECK_FOR_UPDATES|BOOTSTRAP_CLICKHOUSE_ADDRESS|BOOTSTRAP_CLICKHOUSE_USER|BOOTSTRAP_CLICKHOUSE_PASSWORD|BOOTSTRAP_CLICKHOUSE_DATABASE" sh -c export | while read x v; do echo $v; done | grep -E "^(${env_vars})" | $SUDO tee ${FILE_ENV} >/dev/null ;; coroot-cluster-agent) @@ -202,7 +203,7 @@ create_service_file() { $SUDO tee ${FILE_SERVICE} >/dev/null << EOF [Unit] Description=${SYSTEM_DESCRIPTION} -Documentation=https://coroot.com +Documentation=https://docs.coroot.com Wants=network-online.target After=network-online.target @@ -241,11 +242,12 @@ service_enable_and_start() { install() { SYSTEM_NAME=$1 SYSTEM_DESCRIPTION=$2 - ARGS=$3 + REPO=$3 + ARGS=$4 FILE_SERVICE=${SYSTEMD_DIR}/${SYSTEM_NAME}.service FILE_ENV=${FILE_SERVICE}.env - GITHUB_URL="https://github.com/coroot/${SYSTEM_NAME}/releases" + GITHUB_URL="https://github.com/coroot/${REPO}/releases" echo "*** INSTALLING ${SYSTEM_NAME} ***" download @@ -262,6 +264,10 @@ install() { create_uninstall - install coroot "Coroot" "--data-dir=${DATA_DIR}" - install coroot-cluster-agent "Coroot Cluster Agent" "--metrics-wal-dir=${DATA_DIR}" + if [ -n "$LICENSE_KEY" ]; then + install coroot "Coroot" coroot-ee "--data-dir=${DATA_DIR}" + else + install coroot "Coroot" coroot "--data-dir=${DATA_DIR}" + fi + install coroot-cluster-agent "Coroot Cluster Agent" coroot-cluster-agent "--metrics-wal-dir=${DATA_DIR}" } diff --git a/dev.dockerfile b/dev.dockerfile new file mode 100644 index 000000000..edbf9dd99 --- /dev/null +++ b/dev.dockerfile @@ -0,0 +1,45 @@ +FROM node:21 AS frontend-builder +WORKDIR /tmp/src +COPY . . +WORKDIR /tmp/src/front + +RUN npm ci +RUN npm run build-prod + + +FROM golang:1.23-bullseye AS backend-builder +ARG VERSION=unknown + +RUN apt update && apt install -y liblz4-dev +WORKDIR /tmp/src +COPY go.mod . +COPY go.sum . +RUN go mod download +COPY . . +WORKDIR /tmp/src/static +COPY --from=frontend-builder /tmp/src/static /tmp/src/static +WORKDIR /tmp/src +RUN go build -mod=readonly -ldflags "-X main.version=$VERSION" -o coroot . + + +FROM registry.access.redhat.com/ubi9/ubi + +ARG VERSION=unknown +LABEL name="coroot" \ + vendor="Coroot, Inc." \ + maintainer="Coroot, Inc." 
\ + version=${VERSION} \ + release="1" \ + summary="Coroot Community Edition." \ + description="Coroot Community Edition container image." + +COPY LICENSE /licenses/LICENSE + +COPY --from=backend-builder /tmp/src/coroot /usr/bin/coroot +RUN mkdir /data && chown 65534:65534 /data + +USER 65534:65534 +VOLUME /data +EXPOSE 8080 + +ENTRYPOINT ["/usr/bin/coroot"] diff --git a/docs/docs/ai/_category_.yaml b/docs/docs/ai/_category_.yaml new file mode 100644 index 000000000..93641ed00 --- /dev/null +++ b/docs/docs/ai/_category_.yaml @@ -0,0 +1,8 @@ +--- +label: AI-powered Root Cause Analysis +position: 5 +link: + type: generated-index + slug: /ai/ + + diff --git a/docs/docs/ai/configuration.md b/docs/docs/ai/configuration.md new file mode 100644 index 000000000..be242a831 --- /dev/null +++ b/docs/docs/ai/configuration.md @@ -0,0 +1,42 @@ +--- +sidebar_position: 2 +--- + +# Configuration + +:::info +AI-powered Root Cause Analysis is available only in Coroot Enterprise (from $1 per CPU core/month). [Start](https://coroot.com/account) your free trial today. +::: + +Coroot Enterprise Edition supports integration with multiple AI model providers: + +* Anthropic (Claude 3.7 Sonnet) – recommended, as it delivered the best results based on our tests +* OpenAI (GPT-4o) +* Any OpenAI-compatible API, such as DeepSeek or Google Gemini + +To set up an integration, go to **Project Settings** → **AI**. +This is a global setting that applies to all projects and requires the `settings.edit` permission. + +## Anthropic + +To integrate with Anthropic models, simply provide your API key. +Make sure your Coroot Enterprise instance can reach `api.anthropic.com:443`. + +Anthropic + +## OpenAI + +To integrate with OpenAI models, provide your API key. +Make sure your Coroot Enterprise instance can connect to `api.openai.com:443` + +OpenAI + +## OpenAI-compatible APIs + +Coroot also supports any API that is compatible with OpenAI. +We’ve tested integrations with providers like Google Gemini and DeepSeek. + +To configure this, provide your API key, set the base URL of your provider, and specify the model name you want to use. +Make sure your Coroot Enterprise instance can connect to the specified base URL. + +OpenAI-compatible API diff --git a/docs/docs/ai/overview.md b/docs/docs/ai/overview.md new file mode 100644 index 000000000..865aebb56 --- /dev/null +++ b/docs/docs/ai/overview.md @@ -0,0 +1,100 @@ +--- +sidebar_position: 1 +hide_table_of_contents: true +--- + +# Overview + +:::info +AI-powered Root Cause Analysis is available only in Coroot Enterprise (from $1 per CPU core/month). [Start](https://coroot.com/account) your free trial today. +::: + +Coroot’s AI-powered Root Cause Analysis helps teams quickly understand and fix incidents. + +It summarizes complex telemetry data, like metrics, logs, traces, and profiles, into clear and actionable insights. +Using Large Language Models (LLMs), Coroot automatically analyzes system behavior and explains what likely caused performance issues or outages. + +For example, we detected an anomaly in the service SLIs, such as latency spikes and some errors. You can select the anomaly and ask Coroot to explain what caused it. + +Anomaly in SLIs + +First, Coroot follows the dependency graph from the affected service. +It works like an engineer, checking possible causes by comparing telemetry data with the anomaly. +This step uses various ML (Machine Learning) algorithms and doesn’t involve LLMs. 
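+
+As a rough illustration of this step, here is a toy sketch (not Coroot's actual algorithm) of how a candidate cause can be scored by how strongly its telemetry correlates with the anomalous SLI over the same window. All service names and values below are hypothetical:
+
+```go
+package main
+
+import (
+	"fmt"
+	"math"
+)
+
+// pearson returns the Pearson correlation coefficient of two equal-length series.
+func pearson(a, b []float64) float64 {
+	n := float64(len(a))
+	var sumA, sumB float64
+	for i := range a {
+		sumA += a[i]
+		sumB += b[i]
+	}
+	meanA, meanB := sumA/n, sumB/n
+	var cov, varA, varB float64
+	for i := range a {
+		da, db := a[i]-meanA, b[i]-meanB
+		cov += da * db
+		varA += da * da
+		varB += db * db
+	}
+	if varA == 0 || varB == 0 {
+		return 0 // a flat series carries no correlation signal
+	}
+	return cov / math.Sqrt(varA*varB)
+}
+
+func main() {
+	// p95 latency of the affected service during the anomaly window (hypothetical values).
+	frontendLatency := []float64{0.12, 0.13, 0.95, 2.80, 4.28, 3.10}
+
+	// Telemetry of candidate causes over the same window (also hypothetical).
+	candidates := map[string][]float64{
+		"ad: JVM GC pause time": {0.01, 0.02, 0.30, 0.48, 0.52, 0.40},
+		"cart: CPU utilization": {0.55, 0.54, 0.56, 0.55, 0.53, 0.56},
+	}
+
+	// Rank hypotheses by how closely their telemetry tracks the anomaly.
+	for name, series := range candidates {
+		fmt.Printf("%-22s correlation=%.2f\n", name, pearson(frontendLatency, series))
+	}
+}
+```
+
+In this toy example the GC-pause series tracks the latency spike closely while CPU utilization stays flat, so the GC hypothesis would rank higher.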
+ +As a result, Coroot identifies the most likely root causes, relevant logs, and key signals that support each hypothesis. + +However, the output can still be quite detailed and may require some analysis. Without full context, it can be hard to interpret. +In many cases, even experienced engineers look for solutions in documentation or community forums. + +Detailed RCA report + +This is where the LLM helps. Using the rich context collected by Coroot, it summarizes the findings and suggests possible fixes or next steps. + +When you click the "**Explain with AI**" button, Coroot sends only its findings to the selected model, not all the raw telemetry data. +This way, we avoid relying on LLMs for the actual root cause analysis, where they are not very effective. +Instead, we use them for what they do best: explaining complex issues and summarizing results. + +Here’s the summary for this incident: + + +
+ +### Anomaly summary + +The _frontend_ service is experiencing significant performance degradation with latency spikes reaching up to 4.28s (p95) and 10s (p99), along with failed requests occurring at a rate of up to 0.067 per second. + +### Issue propagation paths + +The issue appears to follow these paths: + +* _frontend_ → _ad_ (high latency correlation) +* _ad_ → _otel-collector_ (increased latency) +* _frontend_ → _otel-collector_ (direct dependency also affected) + +### Key findings and Root Cause Analysis + +The root cause appears to be in the ad service (Java application): + +* Instances of _ad_ service have been terminated by the OOM killer, indicating memory exhaustion +* Before termination, the service experienced high JVM Safepoint time (up to 0.52s per second) +* Serial full collection GC pauses reached 0.52s per second, strongly correlating with the _frontend_ latency issues +* The _ad_ service's communication with otel-collector also shows increased latency (up to 427.5ms at p99) + +This suggests the ad service is experiencing severe memory pressure, leading to excessive garbage collection, +which is blocking request processing and eventually causing OOM crashes. + +### Remediation + +To resolve this issue: + +* Increase the heap memory allocation for the _ad_ service to prevent OOM situations +* Consider tuning the JVM garbage collection settings to reduce full GC pauses: + * Switch from Serial GC to G1GC or ZGC for better pause time management + * Adjust heap size ratios (young/old generation) +* Investigate potential memory leaks in the _ad_ service code +* Review the interaction between _ad_ service and _otel-collector_ to optimize telemetry data transmission +* Consider implementing circuit breakers in frontend to gracefully handle ad service failures + +### Relevant charts + +Relevant charts + +
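+To make that "findings only" flow concrete, here is a hypothetical sketch (an illustration, not Coroot's actual code or API): the pre-computed findings are serialized into a compact prompt and posted to the configured OpenAI-compatible endpoint. The endpoint URL, model name, and environment variable below are made up:
+
+```go
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+)
+
+// Finding is a hypothetical, compact representation of one RCA hypothesis.
+type Finding struct {
+	Service  string   `json:"service"`
+	Summary  string   `json:"summary"`
+	Evidence []string `json:"evidence"`
+}
+
+func main() {
+	// Only pre-computed findings are serialized, never the raw telemetry.
+	findings := []Finding{{
+		Service: "ad",
+		Summary: "OOM kills preceded by long serial full GC pauses",
+		Evidence: []string{
+			"JVM safepoint time up to 0.52s per second",
+			"frontend p95 latency correlates with ad GC pauses",
+		},
+	}}
+	prompt, _ := json.MarshalIndent(findings, "", "  ")
+
+	body, _ := json.Marshal(map[string]any{
+		"model": "gpt-4o", // whichever model the integration is configured with
+		"messages": []map[string]string{
+			{"role": "system", "content": "Summarize these RCA findings and suggest fixes."},
+			{"role": "user", "content": string(prompt)},
+		},
+	})
+
+	// Hypothetical base URL and API key; in practice both come from the AI integration settings.
+	req, _ := http.NewRequest("POST", "https://llm.example.com/v1/chat/completions", bytes.NewReader(body))
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+os.Getenv("LLM_API_KEY"))
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		fmt.Println("request failed:", err)
+		return
+	}
+	defer resp.Body.Close()
+	fmt.Println("status:", resp.Status)
+}
+```
+
+Because the payload carries only structured findings, the request stays small and the model is used for explanation and summarization rather than for the analysis itself.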
+ +Coroot’s AI-powered Root Cause Analysis makes it easier to understand and fix issues by combining smart diagnostics +with clear AI explanations. + +In the next section, we’ll cover how to configure Coroot to work with different AI model providers like OpenAI and Anthropic. + + + + + + + + + + + + diff --git a/docs/docs/alerting/opsgenie.md b/docs/docs/alerting/opsgenie.md index ac5741229..1382c09ef 100644 --- a/docs/docs/alerting/opsgenie.md +++ b/docs/docs/alerting/opsgenie.md @@ -4,8 +4,9 @@ sidebar_position: 6 # Opsgenie -To configure a **Rest HTTP API** integration in your Opsgenie account: +## Configure Opsgenie +To configure a **Rest HTTP API** integration in your Opsgenie account: * Navigate to the **Teams** page * Choose a team (or create a new one) * Go to **Integrations** → **Add integration** and create an **API** integration @@ -16,7 +17,7 @@ To configure a **Rest HTTP API** integration in your Opsgenie account: * Copy the **API Key** Opsgenie API Key -On the Coroot side: +## Configure Coroot * Go to the **Project Settings** → **Integrations** * Create an Opsgenie integration @@ -24,8 +25,3 @@ On the Coroot side: Coroot Opsgenie Integration * You can also send a test alert to check the integration Coroot Opsgenie Test Alert - - - - - diff --git a/docs/docs/alerting/pagerduty.md b/docs/docs/alerting/pagerduty.md index 3054897dd..5ae79ffca 100644 --- a/docs/docs/alerting/pagerduty.md +++ b/docs/docs/alerting/pagerduty.md @@ -4,6 +4,8 @@ sidebar_position: 5 # Pagerduty +## Configure Pagerduty + To configure an **Events API V2** integration in your Pagerduty account: * Navigate to **Services** → **Service Directory** Pagerduty Services @@ -17,17 +19,11 @@ To configure an **Events API V2** integration in your Pagerduty account: Pagerduty Integration Key -On the Coroot side: +## Configure Coroot * Go to the **Project Settings** → **Integrations** * Create a Pagerduty integration * Paste the Integration Key to the form Coroot Pagerduty Integration - * You can also send a test alert to check the integration Coroot Pagerduty Test Alert - - - - - diff --git a/docs/docs/alerting/slack.md b/docs/docs/alerting/slack.md index 9b756e238..2358d935a 100644 --- a/docs/docs/alerting/slack.md +++ b/docs/docs/alerting/slack.md @@ -4,6 +4,8 @@ sidebar_position: 3 # Slack +## Configure Slack + If you want to receive alerts to your Slack channel, you’ll need to create a Slack App and make it available to Coroot. To configure a slack integration go to the **Project Settings** → **Integrations**. @@ -13,7 +15,7 @@ Click on **Create Slack app**. Coroot will open a new browser tab and send you o When you click on Create Slack app, Coroot will pass along the app manifest, which Slack will use to set up your app. :::info -You may get a warning that says: **This app is created from a 3rd party manifes**t. +You may get a warning that says: **This app is created from a 3rd party manifest**. This warning is expected (Coroot is the third party here). You can click on Configure to see the app manifest Coroot sent along in the URL. The manifest just take cares of some settings for your app and helps speed things along. ::: @@ -32,17 +34,14 @@ Then go to **OAuth and Permissions** and copy the **Bot User OAuth Token**. Slack Bot Token -On the Coroot side: +## Configure Coroot + * Go to the **Project settings** → **Integrations** * Create a Slack integration * Paste the token to the form Coroot Slack Integration - -* Coroot can send alerts into any public channel in your Slack workspace. 
Enter that channel’s name in the **Slack channel Name** field +* Coroot can send alerts into any public channel in your Slack workspace. + Specify the channel name in the **Default Slack channel name** field. + This channel will be used unless [overridden](/configuration/application-categories#notification-routing) by an application category's settings. * You can also send a test alert to check the integration - Coroot Slack Test Alert - - - - - + Coroot Slack Test Alert diff --git a/docs/docs/alerting/teams.md b/docs/docs/alerting/teams.md index 2d4fdf5cd..a683f29ec 100644 --- a/docs/docs/alerting/teams.md +++ b/docs/docs/alerting/teams.md @@ -4,30 +4,34 @@ sidebar_position: 4 # Microsoft Teams -To configure an **Incoming webhook connector** in your Microsoft Teams: -* Navigate to **Teams** -* Choose a channel (or create a new one) -* Select **...** from the top navigation menu and choose **Connectors** - Create an MS Teams incomming webhook connector - -* Search for **Incoming Webhook** and press the **Configure** button - Configure Incomming webhook connector - -* Provide a name for the webhook (e.g. _Coroot_), customize the image (you can use the [Coroot logo](https://coroot.com/static/img/coroot.png)), and click **Create** - Configure Incomming webhook connector -* Copy the webhook URL +## Configure Microsoft Teams -On the Coroot side: +To configure an **Incoming webhook with Workflows** in your Microsoft Teams: +* Choose a channel (or create a new one) +* Click **...** next to the channel and select **Workflows** + MS Teams integration step 1 +* Choose the **Post to a channel when a webhook request is received** workflow template + MS Teams integration step 2 +* Provide a name (e.g., _Coroot_) and click **Next** + MS Teams integration step 3 +* Click **Add workflow** + MS Teams integration step 4 +* Copy the workflow URL and click **Done** + MS Teams integration step 5 + +:::info +If you ever need to copy the workflow URL again, you’ll be able to find it by opening the Workflows app within Teams, +selecting the workflow that was created, selecting **Edit**, and expanding the trigger **When a Teams webhook request is received**. +::: +MS Teams integration step 5 +For more information, refer to the [Webhooks with Workflows for Microsoft Teams](https://support.microsoft.com/en-us/office/create-incoming-webhooks-with-workflows-for-microsoft-teams-8ae491c7-0394-4861-ba59-055e33f75498) documentation. + +## Configure Coroot * Go to the **Project Settings** → **Integrations** * Create an MS Teams integration -* Paste the webhook URL to the form +* Paste the workflow URL to the form MS Teams integration * You can also send a test alert to check the integration MS Teams Test Alert - - - - - diff --git a/docs/docs/alerting/webhook.md b/docs/docs/alerting/webhook.md index 0cf085009..889e8bc90 100644 --- a/docs/docs/alerting/webhook.md +++ b/docs/docs/alerting/webhook.md @@ -93,7 +93,7 @@ Deployment of {{ .Application.Name }}@{{ .Application.Namespace }} ``` -If the system you aim to integrate accepts JSON-formatted messages, you can employ the built-in json template function: +If the system you aim to integrate accepts JSON-formatted messages, you can employ the built-in `json` template function: ```gotemplate {{ json . }} @@ -162,6 +162,10 @@ Follow the steps below to create a Telegram Bot and obtain `chat_id`: * Now, you need to fetch the chat ID. You can do this using various methods: * Use a Telegram bot like `@userinfobot` or `@getidsbot`. 
Send the command `/getid` in the chat and it will reply with the chat ID.
+Now build the webhook link. It will look like `https://api.telegram.org/bot[botID]:[botToken]/sendMessage`. Paste it into the **Webhook URL** field.
+
+Then proceed with filling in the templates.
+
Sample incident template:

```gotemplate
diff --git a/docs/docs/configuration/application-categories.md b/docs/docs/configuration/application-categories.md
index 43469ccca..1baf70ca3 100644
--- a/docs/docs/configuration/application-categories.md
+++ b/docs/docs/configuration/application-categories.md
@@ -6,17 +6,46 @@ sidebar_position: 9
Coroot allows you to organize your applications into custom groups called Application Categories.
These act like scopes, helping you either hide certain applications or focus on specific ones more easily.

+Additionally, Application Categories can be used for [notification routing](#notification-routing) — for example, to send alerts to different Slack channels based on category.
+
+Application Categories
+
+## Kubernetes annotations
+
+To define a category for a Kubernetes application (Deployment, StatefulSet, DaemonSet, or CronJob),
+annotate it with the `coroot.com/application-category` annotation.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: some-app
+  namespace: default
+  annotations:
+    coroot.com/application-category: auxiliary
+```
+
+The application category can also be defined using Pod annotations.
+
+## Pattern-based configuration
+
+For non-Kubernetes applications, or in cases where setting annotations is not possible,
+Coroot allows you to configure Application Categories manually by matching applications using patterns.
+
+:::info
+Application categories defined via annotations take precedence over those configured manually.
+:::

To configure Application Categories, go to the **Project Settings**, click on **Applications**, and adjust
the built-in categories or create your own custom ones. Each category is defined by a set of
[glob patterns](https://en.wikipedia.org/wiki/Glob_(programming)) in the `<namespace>/<application_name>` format.
-Coroot also includes several pre-defined categories, such as monitoring and control-plane.
-
+Coroot also includes several pre-defined categories, such as `monitoring` and `control-plane`.

-Configuring Application Categories
+Configuring Application Categories

## Quick links
+
To make organizing your apps easier, Coroot allows you to define the category for an app directly on the service map:

Categories on Service Map
@@ -25,3 +54,7 @@ To make organizing your apps easier, Coroot allows you to define the category fo

Setting Categories from the Application page

+## Notification routing
+
+A category can be configured to enable or disable notifications about incidents (SLO violations) and new deployments.
+Setting Categories from the Application page
diff --git a/docs/docs/configuration/authentication.md b/docs/docs/configuration/authentication.md
index 1a381a32c..2bc550f71 100644
--- a/docs/docs/configuration/authentication.md
+++ b/docs/docs/configuration/authentication.md
@@ -65,20 +65,47 @@ This makes the login process easier and more secure by centralizing authenticati
* Click Next.
* On the General Settings tab, enter a name for your Coroot integration. You can also upload the [logo](https://coroot.com/static/img/coroot_512.png).
Okta app
-
* On the Configure SAML tab:
* For both Single sign on URL and Audience URI (SP Entity ID) fields use the https://COROOT_ADDRESS/sso/saml URL.
SAML Okta params
-
* In the Attribute Statements section, configure `Email`, `FirstName`, and `LastName` attributes.
Okta SAML attributes
-
* Click Next.
* On the final Feedback tab, fill out the form and then click Finish.
* Download **Identity Provider Metadata XML** using the `Metadata URL`:
-  Add user
-
-### Configure SAML authentication in Coroot
+  Okta SAML metadata
+* [Configure and enable](#configure-saml-for-coroot) SAML authentication for Coroot.
+
+### Setup SAML with Keycloak
+
+* Log in to Keycloak as an administrator.
+* Select **Clients**, then click **Create client**.
+  Keycloak client general settings
+* Click **Next** and configure the **Home URL** and **Valid redirect URIs** fields.
+  Keycloak client login settings
+* **Save** the client.
+* Under the **Keys** tab, set **Client signature required** to **Off**.
+  Keycloak client keys settings
+* Navigate to the **Client scopes** tab and click **http://<COROOT ADDRESS>/sso/saml-dedicated**.
+  Keycloak client scopes
+* Click **Add predefined mapper**, select the **X500 email**, **X500 givenName**, and **X500 surname** attributes, and click **Add**.
+  Keycloak client mappers
+* Configure attribute mapping.
+  :::info
+  Coroot expects to receive the following attributes: Email, FirstName, and LastName
+  :::
+  Keycloak client mappers
+  * Click **X500 email** and set **SAML Attribute Name** to _Email_, and **SAML Attribute NameFormat** to _Basic_.
+    Keycloak client mappers Email
+  * Click **X500 givenName** and set **SAML Attribute Name** to _FirstName_, and **SAML Attribute NameFormat** to _Basic_.
+    Keycloak client mappers Email
+  * Click **X500 surname** and set **SAML Attribute Name** to _LastName_, and **SAML Attribute NameFormat** to _Basic_.
+    Keycloak client mappers Email
+* Within your realm, select **Realm settings** and download **SAML 2.0 Identity Provider Metadata**.
+  Keycloak SAML metadata
+* [Configure and enable](#configure-saml-for-coroot) SAML authentication for Coroot.
+
+### Configure SAML for Coroot

* Navigate to the **Project Settings** > **Organization** > **Single Sign-On (SAML)** section.
SSO
@@ -92,5 +119,7 @@ This makes the login process easier and more secure by centralizing authenticati

Each team member authenticated through the Identity Provider will be displayed in the Users list in Coroot,
allowing you to manually change their roles.

-Use http://COROOT_ADDRESS/login page and the admin user credentials to log in to your Coroot instance if you encounter any issues with SSO.
+### Troubleshooting
+
+Use the http://<COROOT_ADDRESS>/login page and the **admin** user credentials to log in to your Coroot instance if you encounter any issues with SSO.
diff --git a/docs/docs/configuration/configuration.md b/docs/docs/configuration/configuration.md
index 230ed32a9..ee65d3f1f 100644
--- a/docs/docs/configuration/configuration.md
+++ b/docs/docs/configuration/configuration.md
@@ -114,10 +114,101 @@ license_key: # License key for Coroot Enterprise Edition.
# The project defined here will be created if it does not exist
# and will be configured with the provided API keys.
# If a project with the same name already exists (e.g., configured via the UI),
-# its API keys will be replaced.
+# its API keys and other settings will be replaced.
projects: # Create or update projects (configuration file only).
  - name: # Project name (e.g., production, staging; must be unique; required).
-    api_keys: # Project API keys, used by agents to send telemetry data (required).
+    # Project API keys, used by agents to send telemetry data (required).
+    apiKeys:
      - key: # Random string or UUID (must be unique; required).
        description: # The API key description (optional).
-```
\ No newline at end of file
+    # Project notification integrations.
+    notificationIntegrations:
+      baseURL: # The URL of the Coroot instance (required). Used for generating links in notifications.
+      slack:
+        token: # Slack Bot User OAuth Token (required).
+        defaultChannel: # Default channel (required).
+        incidents: false # Notify of incidents (SLO violations).
+        deployments: false # Notify of deployments.
+      teams:
+        webhookURL: # Microsoft Teams Webhook URL (required).
+        incidents: false # Notify of incidents (SLO violations).
+        deployments: false # Notify of deployments.
+      pagerduty:
+        integrationKey: # PagerDuty Integration Key (required).
+        incidents: false # Notify of incidents (SLO violations).
+      opsgenie:
+        apiKey: # Opsgenie API Key (required).
+        euInstance: false # EU instance of Opsgenie.
+        incidents: false # Notify of incidents (SLO violations).
+      webhook:
+        url: # Webhook URL (required).
+        tlsSkipVerify: false # Whether to skip verification of the Webhook server's TLS certificate.
+        basicAuth: # Basic auth credentials.
+          username:
+          password:
+        customHeaders: # Custom headers to include in requests.
+          - key:
+            value:
+        incidents: false # Notify of incidents (SLO violations).
+        deployments: false # Notify of deployments.
+        incidentTemplate: "" # Incident template (required if `incidents: true`).
+        deploymentTemplate: "" # Deployment template (required if `deployments: true`).
+    # Project application category settings.
+    applicationCategories:
+      - name: # Application category name (required).
+        customPatterns: # List of glob patterns in the <namespace>/<application_name> format.
+          - staging/*
+          - test-*/*
+        notificationSettings: # Category notification settings.
+          incidents: # Notify of incidents (SLO violations).
+            enabled: true
+            slack:
+              enabled: true
+              channel: ops
+            teams:
+              enabled: false
+            pagerduty:
+              enabled: false
+            opsgenie:
+              enabled: false
+            webhook:
+              enabled: false
+          deployments: # Notify of deployments.
+            enabled: true
+            slack:
+              enabled: true
+              channel: general
+            teams:
+              enabled: false
+            webhook:
+              enabled: false
+    # Project custom applications settings.
+    customApplications:
+      - name: custom-app
+        instancePatterns:
+          - app@node1
+          - app@node2
+
+# Single Sign-on configuration (Coroot Enterprise edition only).
+sso:
+  enabled: false
+  defaultRole: Viewer # Default role for authenticated users (Admin, Editor, Viewer, or a custom role).
+  saml:
+    # SAML Identity Provider Metadata XML (required).
+    metadata: |
+      
+      ...
+      
+
+# AI configuration (Coroot Enterprise edition only).
+ai:
+  provider: # AI model provider (one of: anthropic, openai, or openai_compatible).
+  anthropic:
+    apiKey: # Anthropic API key.
+  openai:
+    apiKey: # OpenAI API key.
+  openaiCompatible:
+    apiKey: # API key.
+    baseUrl: # Base URL (e.g., https://generativelanguage.googleapis.com/v1beta/openai).
+    model: # Model name (e.g., gemini-2.5-pro-preview-06-05).
+```
diff --git a/docs/docs/configuration/coroot-node-agent.md b/docs/docs/configuration/coroot-node-agent.md
new file mode 100644
index 000000000..9c7d80724
--- /dev/null
+++ b/docs/docs/configuration/coroot-node-agent.md
@@ -0,0 +1,50 @@
+---
+sidebar_position: 11
+---
+
+# Coroot-node-agent
+
+Coroot-node-agent is a Prometheus- and OpenTelemetry-compatible agent that gathers comprehensive telemetry data about
+all containers running on a node and the node itself.
+ +It collects and exports the following telemetry: + +- **Metrics**: Exported in Prometheus format or sent using the Prometheus Remote Write protocol. +- **Traces**: eBPF-based network and application traces sent via OTLP/HTTP (OpenTelemetry protocol). +- **Logs**: Discovers container logs and sends them via OTLP/HTTP. +- **Profiles**: Uses the Pyroscope eBPF profiler to collect CPU profiles and sends them via a custom HTTP-based protocol. + +## Node Agent Configuration + +You can configure coroot-node-agent using command-line flags or environment variables. + +| Flag | Env Variable | Default | Description | +|------|--------------|---------|-------------| +| `--listen` | `LISTEN` | `0.0.0.0:80` | HTTP listen address | +| `--cgroupfs-root` | `CGROUPFS_ROOT` | `/sys/fs/cgroup` | Path to the host's cgroup filesystem root | +| `--disable-log-parsing` | `DISABLE_LOG_PARSING` | `false` | Disable container log parsing | +| `--disable-pinger` | `DISABLE_PINGER` | `false` | Disable ICMP ping to upstreams | +| `--disable-l7-tracing` | `DISABLE_L7_TRACING` | `false` | Disable application-layer (L7) tracing | +| `--container-allowlist` | `CONTAINER_ALLOWLIST` | – | List of allowed containers (regex patterns) | +| `--container-denylist` | `CONTAINER_DENYLIST` | – | List of denied containers (regex patterns) | +| `--exclude-http-requests-by-path` | `EXCLUDE_HTTP_REQUESTS_BY_PATH` | – | Exclude HTTP paths from metrics/traces | +| `--track-public-network` | `TRACK_PUBLIC_NETWORK` | `0.0.0.0/0` | Public IP networks to track | +| `--ephemeral-port-range` | `EPHEMERAL_PORT_RANGE` | `32768-60999` | TCP ports to exclude from tracking | +| `--provider` | `PROVIDER` | – | `provider` label for `node_cloud_info` | +| `--region` | `REGION` | – | `region` label for `node_cloud_info` | +| `--availability-zone` | `AVAILABILITY_ZONE` | – | `availability_zone` label for `node_cloud_info` | +| `--instance-type` | `INSTANCE_TYPE` | – | `instance_type` label for `node_cloud_info` | +| `--instance-life-cycle` | `INSTANCE_LIFE_CYCLE` | – | `instance_life_cycle` label for `node_cloud_info` | +| `--log-per-second` | `LOG_PER_SECOND` | `10.0` | Rate limit for logs per second | +| `--log-burst` | `LOG_BURST` | `100` | Max burst for log rate limiting | +| `--max-label-length` | `MAX_LABEL_LENGTH` | `4096` | Max metric label length | +| `--collector-endpoint` | `COLLECTOR_ENDPOINT` | – | Unified base URL for telemetry export | +| `--api-key` | `API_KEY` | – | Coroot API key | +| `--metrics-endpoint` | `METRICS_ENDPOINT` | – | Custom URL for metrics export | +| `--traces-endpoint` | `TRACES_ENDPOINT` | – | Custom URL for traces export | +| `--logs-endpoint` | `LOGS_ENDPOINT` | – | Custom URL for logs export | +| `--profiles-endpoint` | `PROFILES_ENDPOINT` | – | Custom URL for profiles export | +| `--insecure-skip-verify` | `INSECURE_SKIP_VERIFY` | `false` | Skip TLS certificate verification | +| `--scrape-interval` | `SCRAPE_INTERVAL` | `15s` | How often to collect internal metrics | +| `--wal-dir` | `WAL_DIR` | `/tmp/coroot-node-agent` | Directory for WAL storage | +| `--max-spool-size` | `MAX_SPOOL_SIZE` | `500MB` | Max size for on-disk spool | diff --git a/docs/docs/configuration/custom-applications.md b/docs/docs/configuration/custom-applications.md index a7979dd03..83db2707c 100644 --- a/docs/docs/configuration/custom-applications.md +++ b/docs/docs/configuration/custom-applications.md @@ -10,8 +10,29 @@ Coroot groups individual containers into applications using the following approa * Non-Kubernetes containers: Containers 
such as Docker containers or Systemd units are grouped into applications by their names.
For example, Systemd services named mysql on different hosts are grouped into a single application called mysql.

This default approach works well in most cases. However, since no one knows your system better than you do,
-Coroot allows you to manually adjust application groupings to better fit your specific needs.
-You can match desired application instances by defining [glob patterns](https://en.wikipedia.org/wiki/Glob_(programming)) for `instance_name`. Note that this is not applicable to Kubernetes applications.
+Coroot allows you to manually adjust application groupings to better fit your specific needs.
+
+## Kubernetes annotations
+
+To define a custom name for a Kubernetes application (Deployment, StatefulSet, DaemonSet, or CronJob),
+annotate it with the `coroot.com/custom-application-name` annotation.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: some-app-123
+  namespace: default
+  annotations:
+    coroot.com/custom-application-name: some-app
+```
+
+The custom application name can also be defined using Pod annotations.
+
+## Pattern-based configuration
+
+For non-Kubernetes applications:
You can match desired application instances by defining [glob patterns](https://en.wikipedia.org/wiki/Glob_(programming)) for `instance_name`.

For example, if you have 10 Apache HTTPD instances running on 10 nodes as systemd services, Coroot typically groups
them into one application by their unit name.
diff --git a/docs/docs/costs/overview.md b/docs/docs/costs/overview.md
index 228df88d7..9b3a38757 100644
--- a/docs/docs/costs/overview.md
+++ b/docs/docs/costs/overview.md
@@ -53,11 +53,25 @@ By leveraging this information, it can quickly determine the price of each node.
To calculate the cost of each resource separately, Coroot assumes that 1 CPU core costs the same as 1GB of memory.
By doing so, the CPU and memory usage of every application can be easily translated into $$$.

+## Custom cloud pricing
+
+Out of the box, Coroot supports pricing models for AWS, GCP, and Azure. For nodes not running in these clouds,
+Coroot uses predefined prices based on GCP’s C4 machine family in the us-central1 region.
+These prices can be adjusted in the UI.
+
+Custom Cloud Pricing
+
+The prices are defined per hour, based on a single vCPU and 1 GB of memory.
+
+Custom Cloud Pricing Configuration
+
+
+
+
## Limitations

Coroot has some limitations that are important to note.

-* Only AWS, GCP, and Azure are currently supported
* Standard pricing (without discounts)
* The cost calculation considers only CPU, Memory usage and Traffic (egress, cross-AZ) (support for GPUs and volumes will be added later)
* Currently, the cost calculation considers only compute, AWS RDS, and AWS ElastiCache instances (support for EKS/AKS/GKE will be added later)
diff --git a/docs/docs/dashboards/_category_.yaml b/docs/docs/dashboards/_category_.yaml
new file mode 100644
index 000000000..a68b5ca69
--- /dev/null
+++ b/docs/docs/dashboards/_category_.yaml
@@ -0,0 +1,6 @@
+---
+label: Dashboards
+position: 12
+link:
+  type: generated-index
+  slug: /dashboards/
diff --git a/docs/docs/dashboards/overview.md b/docs/docs/dashboards/overview.md
new file mode 100644
index 000000000..a7262da86
--- /dev/null
+++ b/docs/docs/dashboards/overview.md
@@ -0,0 +1,60 @@
+---
+sidebar_position: 1
+---
+
+# Overview
+
+Coroot is an opinionated observability solution out of the box.
+This means it comes with a wide range of predefined inspections and dashboards that help you quickly identify and troubleshoot common issues without any manual configuration.
+
+However, every environment is unique, and sometimes you need to go beyond the built-in views.
+That's where custom dashboards come in. Coroot allows you to create your own dashboards to visualize any metrics that matter to you, turning Coroot into a true single pane of glass for all your observability needs.
+
+Whether you're tracking business KPIs, third-party metrics, or application-specific performance indicators, custom dashboards help you extend Coroot's built-in capabilities.
+
+To learn how to gather custom metrics in Coroot, follow this [guide](/metrics/custom-metrics).
+
+This page will walk you through how to create a new dashboard, add and organize panels, and build effective dashboards tailored to your environment.
+
+Let's get started.
+
+## Create a dashboard
+
+1. Navigate to **Dashboards** and click **Add dashboard**.
+2. Provide a name for your dashboard and, optionally, a description.
+   Coroot Dashboards - Create Dashboard
+3. Click **Save**.
+
+## Add a panel
+
+:::info
+Currently, only the `Time series chart` panel type is supported.
+:::
+
+1. Click **Add panel**.
+2. Enter a **Name** and optionally a **Description**.
+3. Choose or create a panel **Group**.
+4. Enter a PromQL expression in the **Query #1** field.
+5. Optionally, provide a **Legend** for the query. You can reference label values using the format `{{ label_name }}`.
+   Coroot Dashboards - Add Panel
+6. You can add additional PromQL queries if needed.
+7. Click **Apply**.
+8. Adjust the panel’s size by dragging its bottom-right corner, and move it by dragging the top-right corner.
+   Coroot Dashboards - Save Dashboard
+9. Click **Save** to save the dashboard.
+
+## Panel groups
+
+Panel groups let you organize related panels under a shared title.
+They make it easier to keep things tidy and focus on specific parts of your system, such as resource usage, database metrics, or custom business indicators.
+
+You can collapse groups by default to reduce visual clutter, which is especially useful in larger dashboards.
+Groups are easy to reorder with `↑` and `↓` buttons, and you can move panels between them whenever needed to keep everything organized.
+Coroot Dashboards - Panel Groups
+
+## Dashboard Permissions
+Dashboards in Coroot follow role-based access control (RBAC).
+In the Community edition, only Admins and Editors can create or edit dashboards. Viewers can access all dashboards in read-only mode.
+
+The [Enterprise edition](https://coroot.com/enterprise/) offers more control with fine-grained permissions.
+You can define who can view or edit specific dashboards, making it easy to manage access across teams and environments.
\ No newline at end of file
diff --git a/docs/docs/inspections/gpu.md b/docs/docs/inspections/gpu.md
new file mode 100644
index 000000000..4dc61e07c
--- /dev/null
+++ b/docs/docs/inspections/gpu.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 5
+toc_max_heading_level: 2
+---
+
+# GPU
+
+Coroot provides real-time visibility into GPU usage across your applications, helping you understand performance bottlenecks,
+detect resource contention, and optimize workloads running on GPU-enabled nodes.
+
+GPU
+
+
+## Overview
+
+The **GPU** tab in the Application view (e.g., `ollama`) shows detailed metrics collected from NVIDIA GPUs used by your workloads.
+These metrics include both per-application and global GPU stats, presented in easy-to-read time-series charts. + +## Dashboard + +### GPU Usage by Application (%) +Shows the percentage of GPU compute resources used by each instance of the application (e.g., `ollama-56b6fd44bc-4zlvq`). +This reflects SM (streaming multiprocessor) usage. + +### GPU Memory Usage by Application (%) +Tracks how much GPU memory is used by each application container, expressed as a percentage of total available memory. + +### GPU Utilization (%) +Displays the overall utilization of the GPU's compute resources, averaged across all consumers. +This helps identify how busy the GPU is during the observed timeframe. + +### GPU Memory Utilization (%) +Indicates how much of the GPU memory is in use, averaged across the timeframe. Spikes here can indicate memory-heavy operations like loading large models. + +### GPU Consumers +Breaks down GPU utilization by each container using the GPU. This provides transparency into which workloads are actively consuming compute resources. + +### GPU Memory Consumers +Shows how GPU memory is distributed across running containers. Useful for detecting memory-hungry processes. + +### GPU Temperature (°C) +Reports the real-time operating temperature of the GPU. Overheating may suggest thermal throttling or cooling issues. + +### GPU Power Usage (Watts) +Displays the power consumption of the GPU in watts. Power spikes usually correlate with high GPU utilization or memory usage. + diff --git a/docs/docs/inspections/memory.md b/docs/docs/inspections/memory.md index 576c988ba..5e6741787 100644 --- a/docs/docs/inspections/memory.md +++ b/docs/docs/inspections/memory.md @@ -89,8 +89,8 @@ Node memory usage = (`total` - `available`) / `total` * 100% ### Memory consumers
- CPU consumers - CPU consumers + Memory consumers + Memory consumers
When you observe high memory usage on a particular node, diff --git a/docs/docs/inspections/slo.md b/docs/docs/inspections/slo.md index 2db3ca043..15b29f951 100644 --- a/docs/docs/inspections/slo.md +++ b/docs/docs/inspections/slo.md @@ -31,3 +31,40 @@ You can also define any Prometheus [histogram](https://prometheus.io/docs/practi Custom Latency SLO +## Kubernetes annotations + +You can define Service Level Objectives (SLOs) using annotations on Kubernetes objects such as Deployment, StatefulSet, DaemonSet, or CronJob. +Only SLO thresholds are defined via annotations — Coroot uses its eBPF-based metrics to calculate the corresponding Service Level Indicators (SLIs). + +Application latency in Coroot is represented as a histogram with a fixed set of buckets: `5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s`. +When evaluating SLO compliance, Coroot compares the number of requests that meet the latency objective to the total number of requests. +Therefore, the `slo-latency-threshold` value must match one of the predefined buckets. + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: some-app + namespace: default + annotations: + coroot.com/slo-availability-objective: 99.9% + coroot.com/slo-latency-objective: 99.9% + coroot.com/slo-latency-threshold: 100ms +``` +Once SLOs are defined through annotations, they cannot be edited in the UI. + +To disable a specific SLO for an application, set its objective to `0%`: +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: some-app + namespace: default + annotations: + coroot.com/slo-availability-objective: 99% + coroot.com/slo-latency-objective: 0% # Latency SLO is disabled +``` + +:::info +Defining custom SLI metrics via annotations is not supported yet. +::: \ No newline at end of file diff --git a/docs/docs/installation/_category_.yaml b/docs/docs/installation/_category_.yaml index aeb6f5e11..83d6a64f3 100644 --- a/docs/docs/installation/_category_.yaml +++ b/docs/docs/installation/_category_.yaml @@ -4,5 +4,3 @@ position: 2 link: type: generated-index slug: /installation/ - - diff --git a/docs/docs/installation/docker-swarm.md b/docs/docs/installation/docker-swarm.md index 85c0e75ea..af0d6e638 100644 --- a/docs/docs/installation/docker-swarm.md +++ b/docs/docs/installation/docker-swarm.md @@ -92,8 +92,8 @@ This initializes a new Docker Swarm and joins the current node as a manager. Deploy the Coroot stack to your cluster by running the following command on the manager node. Before applying, you can review the configuration file in Coroot's GitHub repository: docker-swarm-stack.yaml ``` -curl -fsS https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/docker-swarm-stack.yaml | \ - LICENSE_KEY="COROOT-LICENSE-KEY-HERE" docker stack deploy -c - coroot-ee +curl -fsS https://raw.githubusercontent.com/coroot/coroot/main/deploy/docker-swarm-stack.yaml | \ + LICENSE_KEY="COROOT-LICENSE-KEY-HERE" docker stack deploy -c - coroot ``` **Step #3: Validate the deployment** @@ -103,7 +103,7 @@ Here's an example of how the output might look: ``` NAME SERVICES -coroot-ee 4 +coroot 4 ``` **Step #4: Installing coroot-node-agent** @@ -132,7 +132,7 @@ Access Coroot through any node in your Docker Swarm cluster using its published To uninstall Coroot run the following command: ``` -docker stack rm coroot-ee +docker stack rm coroot ```
diff --git a/docs/docs/installation/docker.md b/docs/docs/installation/docker.md index da54ccc0e..cd9763c82 100644 --- a/docs/docs/installation/docker.md +++ b/docs/docs/installation/docker.md @@ -87,7 +87,7 @@ To deploy Coroot using Docker Compose, run the following command. Before applyin you can review the configuration file in Coroot's GitHub repository: docker-compose.yaml ``` -curl -fsS https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/docker-compose.yaml | \ +curl -fsS https://raw.githubusercontent.com/coroot/coroot/main/deploy/docker-compose.yaml | \ LICENSE_KEY="COROOT-LICENSE-KEY-HERE" docker compose -f - up -d ``` @@ -103,11 +103,11 @@ You should see an output similar to this if the deployment is successful: ``` CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -870119cb6859 ghcr.io/coroot/coroot-cluster-agent "coroot-cluster-agen…" 29 seconds ago Up 16 seconds coroot-ee-cluster-agent-1 -6f3b8f1c821c ghcr.io/coroot/coroot-ee:1.5.4 "/opt/coroot/coroot-…" 42 seconds ago Up 16 seconds 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp coroot-ee-coroot-1 -320e9154a8ba clickhouse/clickhouse-server:24.3 "/entrypoint.sh" About a minute ago Up About a minute 8123/tcp, 9009/tcp, 127.0.0.1:9000->9000/tcp coroot-ee-clickhouse-1 -76b5968068f0 prom/prometheus:v2.45.4 "/bin/prometheus --c…" About a minute ago Up About a minute 127.0.0.1:9090->9090/tcp coroot-ee-prometheus-1 -51e91e09e58a ghcr.io/coroot/coroot-node-agent "coroot-node-agent -…" About a minute ago Up About a minute coroot-ee-node-agent-1 +870119cb6859 ghcr.io/coroot/coroot-cluster-agent "coroot-cluster-agen…" 29 seconds ago Up 16 seconds coroot-cluster-agent-1 +6f3b8f1c821c ghcr.io/coroot/coroot-ee:1.5.4 "/opt/coroot/coroot-…" 42 seconds ago Up 16 seconds 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp coroot-coroot-1 +320e9154a8ba clickhouse/clickhouse-server:24.3 "/entrypoint.sh" About a minute ago Up About a minute 8123/tcp, 9009/tcp, 127.0.0.1:9000->9000/tcp coroot-clickhouse-1 +76b5968068f0 prom/prometheus:v2.45.4 "/bin/prometheus --c…" About a minute ago Up About a minute 127.0.0.1:9090->9090/tcp coroot-prometheus-1 +51e91e09e58a ghcr.io/coroot/coroot-node-agent "coroot-node-agent -…" About a minute ago Up About a minute coroot-node-agent-1 ``` **Step #4: Accessing Coroot** @@ -121,7 +121,7 @@ http://NODE_IP_ADDRESS:8080/. To uninstall Coroot run the following command: ``` -curl -fsS https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/docker-compose.yaml | \ +curl -fsS https://raw.githubusercontent.com/coroot/coroot/main/deploy/docker-compose.yaml | \ docker compose rm -f -s -v ``` diff --git a/docs/docs/installation/k8s-operator.md b/docs/docs/installation/k8s-operator.md index 68f49eedd..23fbfbf5d 100644 --- a/docs/docs/installation/k8s-operator.md +++ b/docs/docs/installation/k8s-operator.md @@ -8,7 +8,7 @@ The best way to deploy Coroot into a Kubernetes or OpenShift cluster is by using The operator simplifies the deployment of all required components and enables scaling as needed. It supports the deployment of both Coroot Community and Enterprise editions. -## Operator installation +## Operator installation Add the Coroot helm chart repo: @@ -58,6 +58,7 @@ spec: # type: # Service type (e.g., ClusterIP, NodePort, LoadBalancer). # port: # Service port number. # nodePort: # NodePort number (if type is NodePort). +# annotations: # Annotations for Service. # ingress: # Ingress configuration for Coroot. # className: Ingress class name (e.g., nginx, traefik; if not set the default IngressClass will be used). 
#     host: # Domain name for Coroot (e.g., coroot.company.com).
@@ -84,6 +85,9 @@ spec:
# Configuration for Coroot Enterprise Edition.
# enterpriseEdition:
#   licenseKey: COROOT-1111-111 # License key for Coroot Enterprise Edition.
+#   licenseKeySecret: # Secret containing the license key.
+#     name: # Name of the secret to select from.
+#     key: # Key of the secret to select from.
#   image: # If unspecified, the operator will automatically update Coroot EE to the latest version from Coroot's public registry.
#     name: # Specifies the full image reference (e.g., /coroot-ee:)
#     pullPolicy: # The image pull policy (e.g., Always, IfNotPresent, Never).
@@ -91,9 +95,14 @@ spec:

# Configures the operator to install only the node-agent and cluster-agent.
# agentsOnly:
-#   corootURL: http://COROOT_IP:PORT/ # URL of the Coroot instance to which agents send metrics, logs, traces, and profiles.
+#   corootURL: http(s)://COROOT_IP:PORT/ # URL of the Coroot instance to which agents send metrics, logs, traces, and profiles.
+#   tlsSkipVerify: false # Whether to skip verification of the Coroot server's TLS certificate.

-# apiKey: # The API key used by agents when sending telemetry to Coroot.
+# The API key used by agents when sending telemetry to Coroot.
+# apiKey: # Plain-text API key. Prefer using `apiKeySecret` for better security.
+# apiKeySecret: # Secret containing the API key.
+#   name: # Name of the secret to select from.
+#   key: # Key of the secret to select from.

# Configuration for Coroot Node Agent.
# nodeAgent:
@@ -116,6 +125,15 @@ spec:
#     name: # Specifies the full image reference (e.g., /coroot-node-agent:)
#     pullPolicy: # The image pull policy (e.g., Always, IfNotPresent, Never).
#     pullSecrets: [] # The pull secrets for pulling the image from a private registry.
+#   trackPublicNetworks: ["0.0.0.0/0"] # Allow tracking of connections to the specified IP networks (e.g., Y.Y.Y.Y/mask). By default, Coroot tracks all connections.
+#   logCollector:
+#     collectLogBasedMetrics: true # Collect log-based metrics. Disables `collectLogEntries` if set to false.
+#     collectLogEntries: true # Collect log entries and store them in ClickHouse.
+#   ebpfTracer:
+#     enabled: true # Collect traces and store them in ClickHouse.
+#     sampling: "1.0" # Trace sampling rate (0.0 to 1.0).
+#   ebpfProfiler:
+#     enabled: true # Collect profiles and store them in ClickHouse.

# Configuration for Coroot Cluster Agent.
# clusterAgent:
@@ -146,14 +164,15 @@ spec:
#     reclaimPolicy: Delete # Options: Retain (keeps PVC) or Delete (removes PVC on Coroot CR deletion).
#     resources: # Resource requests and limits for Prometheus.
#     podAnnotations: # Annotations for Prometheus.
-#   retention: 2d # Metrics retention time (e.g. 4h, 3d, 2w, 1y)
+#   retention: 2d # Metrics retention time (e.g. 4h, 3d, 2w, 1y).
+#   outOfOrderTimeWindow: 1h # The `storage.tsdb.out_of_order_time_window` Prometheus setting.
#   image: # If unspecified, the operator will install Prometheus from Coroot's public registry.
-#     name: # Specifies the full image reference (e.g., /prometheus:)
+#     name: # Specifies the full image reference (e.g., /prometheus:).
#     pullPolicy: # The image pull policy (e.g., Always, IfNotPresent, Never).
#     pullSecrets: [] # The pull secrets for pulling the image from a private registry.

# Use an external Prometheus instance instead of deploying one.
-# NOTE: Remote write receiver must be enabled in your Prometheus via the --web.enable-remote-write-receiver flag.
+# NOTE: Remote write receiver must be enabled in your Prometheus via the `--web.enable-remote-write-receiver` flag.
# externalPrometheus:
#   url: # http(s)://: or http(s)://: or http(s)://:.
#   tlsSkipVerify: false # Whether to skip verification of the Prometheus server's TLS certificate.
@@ -214,7 +233,7 @@ spec:

#   replicas: 1 # Number of Coroot StatefulSet pods.

-# Store configuration in a Postgres DB instead of SQLite (required if replicas > 1).
+# Store configuration in a Postgres DB instead of SQLite (required if `replicas` > 1).
# postgres:
#   host: # Postgres host or service name.
#   port: # Postgres port (optional, default 5432).
@@ -228,10 +247,138 @@ spec:
#   sslmode: disable

# The project defined here will be created if it does not exist and will be configured with the provided API keys.
-# If a project with the same name already exists (e.g., configured via the UI), its API keys will be replaced.
+# If a project with the same name already exists (e.g., configured via the UI), its API keys and other settings will be replaced.
# projects: # Create or update projects.
# - name: # Project name (e.g., production, staging; required).
-#   apiKeys: # Project API keys, used by agents to send telemetry data (required).
-#   - key: # Random string or UUID (must be unique; required).
-#     description: # The API key description (optional).
+#   # Project API keys, used by agents to send telemetry data (required).
+#   apiKeys:
+#   - description: # The API key description (optional).
+#     key: # Plain-text API key (a random string or UUID). Must be unique. Prefer using `keySecret` for better security.
+#     keySecret: # Secret containing the API key. Generated automatically if missing.
+#       name: # Name of the secret to select from.
+#       key: # Key of the secret to select from.
+#   # Project notification integrations.
+#   notificationIntegrations:
+#     baseURL: # The URL of the Coroot instance (required). Used for generating links in notifications.
+#     slack:
+#       token: # Slack Bot User OAuth Token (required).
+#       tokenSecret: # Secret containing the Token.
+#         name: # Name of the secret to select from.
+#         key: # Key of the secret to select from.
+#       defaultChannel: # Default channel (required).
+#       incidents: false # Notify of incidents (SLO violations).
+#       deployments: false # Notify of deployments.
+#     teams:
+#       webhookURL: # Microsoft Teams Webhook URL (required).
+#       webhookURLSecret: # Secret containing the Webhook URL.
+#         name: # Name of the secret to select from.
+#         key: # Key of the secret to select from.
+#       incidents: false # Notify of incidents (SLO violations).
+#       deployments: false # Notify of deployments.
+#     pagerduty:
+#       integrationKey: # PagerDuty Integration Key (required).
+#       integrationKeySecret: # Secret containing the Integration Key.
+#         name: # Name of the secret to select from.
+#         key: # Key of the secret to select from.
+#       incidents: false # Notify of incidents (SLO violations).
+#     opsgenie:
+#       apiKey: # Opsgenie API Key (required).
+#       apiKeySecret: # Secret containing the API Key.
+#         name: # Name of the secret to select from.
+#         key: # Key of the secret to select from.
+#       euInstance: false # EU instance of Opsgenie.
+#       incidents: false # Notify of incidents (SLO violations).
+#     webhook:
+#       url: # Webhook URL (required).
+#       tlsSkipVerify: false # Whether to skip verification of the Webhook server's TLS certificate.
+#       basicAuth: # Basic auth credentials.
+#         username: # Basic auth username.
+#         password: # Basic auth password.
+#         passwordSecret: # Secret containing password.
+#           name: # Name of the secret to select from.
+#           key: # Key of the secret to select from.
+#       customHeaders: # Custom headers to include in requests.
+#       - key:
+#         value:
+#       incidents: false # Notify of incidents (SLO violations).
+#       deployments: false # Notify of deployments.
+#       incidentTemplate: "" # Incident template (required if `incidents: true`).
+#       deploymentTemplate: "" # Deployment template (required if `deployments: true`).
+#   # Project application category settings.
+#   applicationCategories:
+#   - name: # Application category name (required).
+#     customPatterns: # List of glob patterns in the <namespace>/<application_name> format.
+#     - staging/*
+#     - test-*/*
+#     notificationSettings: # Category notification settings.
+#       incidents: # Notify of incidents (SLO violations).
+#         enabled: true
+#         slack:
+#           enabled: true
+#           channel: ops
+#         teams:
+#           enabled: false
+#         pagerduty:
+#           enabled: false
+#         opsgenie:
+#           enabled: false
+#         webhook:
+#           enabled: false
+#       deployments: # Notify of deployments.
+#         enabled: true
+#         slack:
+#           enabled: true
+#           channel: general
+#         teams:
+#           enabled: false
+#         webhook:
+#           enabled: false
+#   # Project custom applications settings.
+#   customApplications:
+#   - name: custom-app
+#     instancePatterns:
+#     - app@node1
+#     - app@node2
+
+# Single Sign-on configuration (Coroot Enterprise edition only).
+# sso:
+#   enabled: false
+#   defaultRole: Viewer # Default role for authenticated users (Admin, Editor, Viewer, or a custom role).
+#   saml:
+#     # SAML Identity Provider Metadata XML (required).
+#     metadata: |
+#       
+#       ...
+#       
+#     metadataSecret: # Secret containing the Metadata XML.
+#       name: # Name of the secret to select from.
+#       key: # Key of the secret to select from.

+# AI configuration (Coroot Enterprise edition only).
+# ai:
+#   provider: # AI model provider (one of: anthropic, openai, or openai_compatible).
+#   anthropic:
+#     apiKey: # Anthropic API key (required).
+#     apiKeySecret: # Secret containing the API key.
+#       name: # Name of the secret to select from.
+#       key: # Key of the secret to select from.
+#   openai:
+#     apiKey: # OpenAI API key (required).
+#     apiKeySecret: # Secret containing the API key.
+#       name: # Name of the secret to select from.
+#       key: # Key of the secret to select from.
+#   openaiCompatible:
+#     apiKey: # API key (required).
+#     apiKeySecret: # Secret containing the API key.
+#       name: # Name of the secret to select from.
+#       key: # Key of the secret to select from.
+#     baseUrl: # Base URL (e.g., https://generativelanguage.googleapis.com/v1beta/openai).
+#     model: # Model name (e.g., gemini-2.5-pro-preview-06-05).
+```
+
+## Operator upgrade
+
+```bash
+helm repo update coroot
+helm upgrade -n coroot coroot-operator coroot/coroot-operator
```
diff --git a/docs/docs/installation/requirements.md b/docs/docs/installation/requirements.md
index 50373316c..e3f63b187 100644
--- a/docs/docs/installation/requirements.md
+++ b/docs/docs/installation/requirements.md
@@ -4,7 +4,7 @@ sidebar_position: 1

# Requirements

- * Coroot relies heavily on eBPF, therefore, the minimum supported Linux kernel version is 4.16.
+ * Coroot relies heavily on eBPF; therefore, the minimum supported Linux kernel version is 5.1.
 * eBPF-based continuous profiling utilizes CO-RE.
CO-RE is supported by most modern Linux distributions, including: * Ubuntu 20.10 and above * Debian 11 and above diff --git a/docs/docs/installation/rhel.md b/docs/docs/installation/rhel.md index f1f0d60b0..73bbc9a95 100644 --- a/docs/docs/installation/rhel.md +++ b/docs/docs/installation/rhel.md @@ -119,14 +119,9 @@ Access Coroot at: http://NODE_IP:8080. To uninstall Coroot run the following command: ``` -/usr/bin/coroot-uninstall.sh +/usr/local/bin/coroot-uninstall.sh ``` -Uninstall coroot-node-agent: - -``` -/usr/bin/coroot-node-agent-uninstall.sh -``` @@ -219,7 +214,7 @@ sudo systemctl enable prometheus **Step #3: Installing Coroot** ``` -curl -sfL https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/install.sh | \ +curl -sfL https://raw.githubusercontent.com/coroot/coroot/main/deploy/install.sh | \ LICENSE_KEY="COROOT-LICENSE-KEY-HERE" \ BOOTSTRAP_PROMETHEUS_URL="http://127.0.0.1:9090" \ BOOTSTRAP_REFRESH_INTERVAL=15s \ @@ -236,13 +231,8 @@ Access Coroot at: http://NODE_IP:8080. To uninstall Coroot run the following command: ``` -/usr/bin/coroot-uninstall.sh +/usr/local/bin/coroot-uninstall.sh ``` -Uninstall coroot-node-agent: - -``` -/usr/bin/coroot-node-agent-uninstall.sh -``` diff --git a/docs/docs/installation/ubuntu.md b/docs/docs/installation/ubuntu.md index d3f7ec61b..2c5f2a900 100644 --- a/docs/docs/installation/ubuntu.md +++ b/docs/docs/installation/ubuntu.md @@ -74,14 +74,9 @@ Access Coroot at: http://NODE_IP:8080. To uninstall Coroot run the following command: ```bash -/usr/bin/coroot-uninstall.sh +/usr/local/bin/coroot-uninstall.sh ``` -Uninstall coroot-node-agent: - -```bash -/usr/bin/coroot-node-agent-uninstall.sh -``` @@ -130,7 +125,7 @@ sudo service prometheus restart **Step #3: Installing Coroot** ```bash -curl -sfL https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/install.sh | \ +curl -sfL https://raw.githubusercontent.com/coroot/coroot/main/deploy/install.sh | \ LICENSE_KEY="COROOT-LICENSE-KEY-HERE" \ BOOTSTRAP_PROMETHEUS_URL="http://127.0.0.1:9090" \ BOOTSTRAP_REFRESH_INTERVAL=15s \ @@ -156,13 +151,8 @@ Access Coroot at: http://NODE_IP:8080. To uninstall Coroot run the following command: ```bash -/usr/bin/coroot-ee-uninstall.sh +/usr/local/bin/coroot-uninstall.sh ``` -Uninstall coroot-node-agent: - -```bash -/usr/bin/coroot-node-agent-uninstall.sh -``` diff --git a/docs/docs/logs/application.md b/docs/docs/logs/application.md new file mode 100644 index 000000000..53abb38a4 --- /dev/null +++ b/docs/docs/logs/application.md @@ -0,0 +1,28 @@ +--- +sidebar_position: 2 +--- + +# Application logs + +In the Application view, Coroot allows you to analyze and correlate application telemetry (availability, latency, CPU metrics, etc.) with raw logs and recurring log patterns. +Logs are pre-filtered by application, eliminating the need to locate it manually in the main Logs view. + +Coroot Log Monitoring + + +## Log patterns +To quickly understand what types of errors appeared in the logs at a particular time, you can switch to the "Patterns" mode. + +Log patterns + +By clicking on any pattern, you can navigate to the original messages that match this pattern (Show Messages). + +Log pattern details + +Log pattern messages + + +## Event details +Clicking on a specific event from the list allows you to access its details, including the full message text, severity, and OpenTelemetry attributes. You can also jump to similar messages that match the same pattern. 
+ +Log message details diff --git a/docs/docs/logs/overview.md b/docs/docs/logs/overview.md index 8ee638a79..2d3dcaeac 100644 --- a/docs/docs/logs/overview.md +++ b/docs/docs/logs/overview.md @@ -4,37 +4,35 @@ sidebar_position: 1 # Overview -Coroot's Logs monitoring enables you to effortlessly analyze your application logs and correlate them with traces, metrics, and profiles. -All logs are grouped by application, eliminating the need for manual navigation. +Coroot's Logs monitoring enables you to effortlessly analyze your application logs and correlate them with traces, metrics, and profiles. -Coroot Log Monitoring - -Coroot's node-agent automatically discovers and gathers logs from all containers on a node, then transmits them to Coroot. -Additionally, it performs low-overhead log analysis right on the node to identify message severities and recurring patterns. +Coroot's node-agent automatically discovers and gathers logs from all containers on a node, then transmits them to Coroot. +Additionally, it performs low-overhead log analysis right on the node to identify message severities and recurring patterns. This process is seamless and compatible with a wide range of log formats, providing valuable meta-information for quick and easy log analysis. -## Log patterns -To quickly understand what types of errors appeared in the logs at a particular time, you can switch to the "Patterns" mode. - -Log patterns +Coroot Log Monitoring -By clicking on any pattern, you can view the message distribution across application instances and navigate to the original messages that match this pattern (Show Messages). +## Event details +Clicking on a specific event from the list allows you to access its details, including the full message text, severity, and OpenTelemetry attributes. +You can also jump to similar messages that match the same pattern. -Log pattern details +Log message details -Log pattern messages +## Quick links -## Event details -Clicking on a specific event from the list allows you to access its details, including the full message text, severity, and OpenTelemetry attributes. You can also jump to similar messages that match the same pattern. +Clicking the application name allows you to filter log entries for that application. -Log message details +Application quick links ## Correlating logs and traces If you instrument your apps with the OpenTelemetry SDK to send logs to Coroot's OpenTelemetry collector along with the tracing context, you can instantly navigate to the corresponding trace with just one click. -Correlating logs and traces +Correlating logs and traces +Correlating logs and traces +Clicking `Show logs` in the Trace view retrieves all log entries associated with the corresponding TraceId. +Correlating logs and traces diff --git a/docs/docs/logs/querying.md b/docs/docs/logs/querying.md new file mode 100644 index 000000000..536ef8c85 --- /dev/null +++ b/docs/docs/logs/querying.md @@ -0,0 +1,23 @@ +--- +sidebar_position: 3 +--- + +# Querying + +Logs can be filtered by severity, message content, and attributes. + +Coroot Log Filtering + +Coroot supports the `=` (equal), `~` (regex match), `!=` (not equal), and `!~` (regex not match) operators for attributes, +and the `🔍` (contains) operator for message body. +Under the hood, the `contains` operator uses token-based search with ClickHouse full-text indexes for improved performance. + +All positive filters (`=`, `~`) with the same attribute name are combined using `OR`, while negative filters (`!=`, `!~`) are combined using `AND`. 
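+As a sketch of how these rules compose, consider three hypothetical filters (the attribute names, values, and YAML shape below are illustrative only, not Coroot's internal query format):
+
+```yaml
+# Filters entered in the UI (illustrative):
+filters:
+  - {attribute: service.name, op: "=",  value: frontend}  # positive
+  - {attribute: service.name, op: "=",  value: ad}        # positive, same attribute
+  - {attribute: severity,     op: "!=", value: DEBUG}     # negative
+# Positive filters on the same attribute are OR-ed; negatives are AND-ed:
+# (service.name = "frontend" OR service.name = "ad") AND severity != "DEBUG"
+```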
+ +To make filtering easier, Coroot provides suggestions for attribute names and values. + +Coroot Log Filtering + +Filters can also be added from the log message details by clicking the `+` (add to search) or `–` (exclude from search) buttons. + +Coroot Log Filtering diff --git a/docs/docs/metrics/_category_.yaml b/docs/docs/metrics/_category_.yaml new file mode 100644 index 000000000..be94445ec --- /dev/null +++ b/docs/docs/metrics/_category_.yaml @@ -0,0 +1,8 @@ +--- +label: Metrics +position: 15 +link: + type: generated-index + slug: /metrics/ + + diff --git a/docs/docs/metrics/cluster-agent.md b/docs/docs/metrics/cluster-agent.md new file mode 100644 index 000000000..0004a1746 --- /dev/null +++ b/docs/docs/metrics/cluster-agent.md @@ -0,0 +1,148 @@ +--- +sidebar_position: 2 +toc_max_heading_level: 2 +--- + +# Cluster-agent + +This page describes metrics gathered by [coroot-cluster-agent](https://github.com/coroot/coroot-cluster-agent). + +Coroot-cluster-agent is a dedicated tool for collecting cluster-wide telemetry data: + * It gathers database metrics by discovering databases through Coroot's Service Map and Kubernetes control-plane. +Using the credentials provided by Coroot or via Kubernetes annotations, the agent connects to the identified databases such as Postgres, MySQL, Redis, Memcached, and MongoDB, collects database-specific metrics, and sends them to Coroot using the Prometheus Remote Write protocol. + * The agent can be integrated with AWS to discover RDS and ElastiCache clusters and collect their telemetry data. + * The agent discovers and scrapes [custom metrics](/metrics/custom-metrics) from annotated pods. + +## Postgres + +### pg_up +* **Description**: Whether the Postgres server is reachable or not +* **Type**: Gauge +* **Source**: The agent checks that a connection to the server is still alive on each scrape + +### pg_probe_seconds +* **Description**: How long it took to execute an empty SQL query (`;`) on the server. This metric shows the round-trip time between the agent and the server +* **Type**: Gauge +* **Source**: The time spent executing `db.Ping()` + +### pg_info +* **Description**: The server info +* **Type**: Gauge +* **Source**: [`pg_settings.server_version`](https://www.postgresql.org/docs/current/view-pg-settings.html) +* **Labels**: server_version + +### pg_setting +* **Description**: Value of the pg_setting variable +* **Type**: Gauge +* **Source**: [`pg_settings`](https://www.postgresql.org/docs/current/view-pg-settings.html). The agent only collects variables of the following types: `integer`, `real` and `bool` +* **Labels**: name, unit + +### pg_connections +* **Description**: The number of the database connections +* **Type**: Gauge +* **Source**: [`pg_stat_activity`](https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW) +* **Labels**: + * db + * user + * state: current state of the connection, < active | idle | idle in transaction > + * wait_event_type: [type](https://www.postgresql.org/docs/current/monitoring-stats.html#WAIT-EVENT-TABLE) of event that the connection is waiting for. + * query - If the state of a connection is `active`, this is the currently executing query. + For `idle in transaction` connections, this is the last executed query. This label holds a normalized and obfuscated query. 
+
+### pg_latency_seconds
+* **Description**: Query execution time
+* **Type**: Gauge
+* **Source**: [`pg_stat_activity`](https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW), [`pg_stat_statements`](https://www.postgresql.org/docs/current/pgstatstatements.html)
+* **Labels**:
+  * summary: < avg | max | p50 | p75 | p95 | p99 >
+
+### pg_db_queries_per_second
+* **Description**: Number of queries executed in the database
+* **Type**: Gauge
+* **Source**: Aggregation of `pg_stat_activity.state = 'Active'` and `pg_stat_statements.calls`
+* **Labels**: db
+
+### pg_lock_awaiting_queries
+* **Description**: Number of queries awaiting a lock
+* **Type**: Gauge
+* **Source**: Number of connections with `pg_stat_activity.wait_event_type = 'Lock'`.
+The `blocking_query` label is calculated using the [pg_blocking_pids](https://www.postgresql.org/docs/current/functions-info.html) function
+* **Labels**: db, user, blocking_query (the query holding the lock)
+
+### Query Metrics
+
+The [pg_stat_statements](https://www.postgresql.org/docs/current/pgstatstatements.html) view shows statistics only for queries that have been completed.
+So, to provide comprehensive statistics, the agent extends this with data about the currently active queries from the [pg_stat_activity](https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-ACTIVITY-VIEW) view.
+
+Collecting stats about each query would produce metrics with very high cardinality.
+However, the primary purpose of such metrics is to show the most resource-consuming queries.
+So, the agent collects these metrics only for the TOP-20 queries by total execution time.
+
+
+Each metric described below has `query`, `db` and `user` labels.
+`Query` is a normalized and obfuscated query from pg_stat_statements.query and pg_stat_activity.query.
+
+For example, the following queries:
+
+```sql
+SELECT * FROM tbl WHERE id='1';
+SELECT * FROM tbl WHERE id='2';
+```
+
+will be grouped into
+
+```sql
+SELECT * FROM tbl WHERE id=?;
+```
+
+### pg_top_query_calls_per_second
+* **Description**: Number of times the query has been executed
+* **Type**: Gauge
+* **Source**: `pg_stat_statements.calls` and `pg_stat_activity.state = 'Active'`
+* **Labels**: db, user, query
+
+### pg_top_query_time_per_second
+* **Description**: Time spent executing the query
+* **Type**: Gauge
+* **Source**: `clock_timestamp()-pg_stat_activity.query_start` and `pg_stat_statements.total_time`
+* **Labels**: db, user, query
+
+### pg_top_query_io_time_per_second
+* **Description**: Time the query spent awaiting I/O
+* **Type**: Gauge
+* **Source**: `pg_stat_activity.wait_event_type = 'IO'`, `pg_stat_statements.blk_read_time` and `pg_stat_statements.blk_write_time`
+* **Labels**: db, user, query
+
+### Replication metrics
+
+### pg_wal_receiver_status
+* **Description**: WAL receiver status: 1 if the receiver is connected, otherwise 0
+* **Type**: Gauge
+* **Source**: `pg_stat_wal_receiver` and `pg_settings[primary_conninfo]`
+* **Labels**: sender_host, sender_port
+
+### pg_wal_replay_paused
+* **Description**: Whether WAL replay is paused or not
+* **Type**: Gauge
+* **Source**: `pg_is_wal_replay_paused()` or `pg_is_xlog_replay_paused()`
+
+### pg_wal_current_lsn
+* **Description**: Current WAL sequence number
+* **Type**: Counter
+* **Source**: `pg_current_wal_lsn()` or `pg_current_xlog_location()`
+
+### pg_wal_receive_lsn
+* **Description**: WAL sequence number that has been received and synced to disk by streaming replication.
+* **Type**: Counter
+* **Source**: `pg_last_wal_receive_lsn()` or `pg_last_xlog_receive_location()`
+
+### pg_wal_reply_lsn
+* **Description**: WAL sequence number that has been replayed during recovery
+* **Type**: Counter
+* **Source**: `pg_last_wal_replay_lsn()` or `pg_last_xlog_replay_location()`
diff --git a/docs/docs/metrics/custom-metrics.md b/docs/docs/metrics/custom-metrics.md
new file mode 100644
index 000000000..06ce0ecd4
--- /dev/null
+++ b/docs/docs/metrics/custom-metrics.md
@@ -0,0 +1,28 @@
+---
+sidebar_position: 3
+toc_max_heading_level: 2
+---
+
+# Custom metrics
+
+Coroot-cluster-agent can scrape custom metrics exposed by an application in the Prometheus format.
+So far, it supports only Kubernetes service discovery.
+
+## Kubernetes service discovery
+
+If a pod exposes metrics on a specific endpoint (like `/metrics`), you can annotate the pod to enable scraping by coroot-cluster-agent.
+
+For example, to enable metrics scraping, add the following annotations to your pod:
+
+```yaml
+metadata:
+  annotations:
+    coroot.com/scrape-metrics: 'true'
+    coroot.com/metrics-port: '8080'
+    coroot.com/metrics-path: '/metrics' # optional
+    coroot.com/metrics-scheme: 'http' # optional
+```
+
+This configuration tells coroot-cluster-agent to scrape metrics from port `8080` and the `/metrics` path.
+
+Each scraped metric will be annotated with the `pod` and `namespace` labels, allowing you to filter and aggregate metrics efficiently.
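+For context, here is a fuller sketch of a complete Pod using these annotations (the pod name and image are placeholders):
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app                           # hypothetical name
+  namespace: default
+  annotations:
+    coroot.com/scrape-metrics: 'true'
+    coroot.com/metrics-port: '8080'
+spec:
+  containers:
+    - name: app
+      image: example.com/my-app:latest   # hypothetical image exposing /metrics
+      ports:
+        - containerPort: 8080            # must match coroot.com/metrics-port
+```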
diff --git a/docs/docs/metrics/node-agent.md b/docs/docs/metrics/node-agent.md
new file mode 100644
index 000000000..9025f11b7
--- /dev/null
+++ b/docs/docs/metrics/node-agent.md
@@ -0,0 +1,639 @@
+---
+sidebar_position: 1
+toc_max_heading_level: 2
+---
+
+# Node-agent
+
+This page describes metrics gathered by [coroot-node-agent](https://github.com/coroot/coroot-node-agent).
+
+Each container metric has the `container_id` label. This is a compound identifier and its format varies between container types, e.g.,
+`/docker/upbeat_borg`, `k8s/namespace-1/pod-2/container-3` or `/system.slice/nginx.service`.
+
+## CPU
+
+### container_resources_cpu_limit_cores
+* **Description**: CPU limit of the container
+* **Type**: Gauge
+* **Source**: [CPU](https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt) cgroup, the `cpu.cfs_quota_us` and `cpu.cfs_period_us` files
+
+### container_resources_cpu_usage_seconds_total
+* **Description**: Total CPU time consumed by the container
+* **Type**: Counter
+* **Source**: [CPU accounting](https://www.kernel.org/doc/Documentation/cgroup-v1/cpuacct.txt) cgroup, the `cpuacct.usage` file
+
+### container_resources_cpu_throttled_seconds_total
+* **Description**: Total time duration the container has been throttled for
+* **Type**: Counter
+* **Source**: [CPU cgroup](https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt), the `cpu.stat` file
+
+### container_resources_cpu_delay_seconds_total
+* **Description**: Total time duration the container has been waiting for a CPU (while being runnable)
+* **Type**: Counter
+* **Source**: [Delay accounting](https://www.kernel.org/doc/html/latest/accounting/delay-accounting.html)
+
+## Memory
+
+### container_resources_memory_limit_bytes
+* **Description**: Memory limit of the container
+* **Type**: Gauge
+* **Source**: [Memory](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt) cgroup, the `memory.limit_in_bytes` file
+
+### container_resources_memory_rss_bytes
+* **Description**: Amount of physical memory used by the container (doesn't include page cache)
+* **Type**: Gauge
+* **Source**: [Memory](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt) cgroup, the `memory.stat` file
+
+### container_resources_memory_cache_bytes
+* **Description**: Amount of page cache memory allocated by the container
+* **Type**: Gauge
+* **Source**: [Memory](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt) cgroup, the `memory.stat` file
+
+### container_oom_kills_total
+* **Description**: Total number of times the container has been terminated by the OOM killer
+* **Type**: Counter
+* **Source**: eBPF: `tracepoint/oom/mark_victim`
+
+## Disk
+
+### container_resources_disk_delay_seconds_total
+* **Description**: Total time duration the container has been waiting for I/Os to complete
+* **Type**: Counter
+* **Source**: [Delay accounting](https://www.kernel.org/doc/html/latest/accounting/delay-accounting.html)
+
+### container_resources_disk_size_bytes
+* **Description**: Total capacity of the volume
+* **Type**: Gauge
+* **Source**: [statfs()](https://man7.org/linux/man-pages/man2/statfs.2.html)
+* **Labels**:
+  * **mount_point** - path in the mount namespace of the container
+  * **device** - device name, e.g., `vda`, `nvme1n1`
+  * **volume** - [Persistent Volume](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) (Kubernetes only)
+
+### container_resources_disk_used_bytes
+* **Description**: Used capacity of the volume
+* **Type**: Gauge
+* **Source**: [statfs()](https://man7.org/linux/man-pages/man2/statfs.2.html)
+* **Labels**: mount_point, device, volume
+
+### container_resources_disk_(reads|writes)_total
+* **Description**: Total number of reads or writes completed successfully by the container
+* **Type**: Counter
+* **Source**: [Blkio](https://www.kernel.org/doc/Documentation/cgroup-v1/blkio-controller.txt) cgroup, the `blkio.throttle.io_serviced` file
+* **Labels**: mount_point, device, volume
+
+### container_resources_disk_(read|written)_bytes_total
+* **Description**: Total number of bytes read from the disk or written to the disk by the container
+* **Type**: Counter
+* **Source**: [Blkio](https://www.kernel.org/doc/Documentation/cgroup-v1/blkio-controller.txt) cgroup, the `blkio.throttle.io_service_bytes` file
+* **Labels**: mount_point, device, volume
+
+## GPU
+
+### container_resources_gpu_usage_percent
+* **Description**: Percent of GPU compute resources used by the container
+* **Type**: Gauge
+* **Source**: NVIDIA Management Library (NVML)
+* **Labels**: gpu_uuid
+
+### container_resources_gpu_memory_usage_percent
+* **Description**: Percent of GPU memory used by the container
+* **Type**: Gauge
+* **Source**: NVIDIA Management Library (NVML)
+* **Labels**: gpu_uuid
+
+## Network
+
+### container_net_tcp_listen_info
+* **Description**: A TCP listen address of the container
+* **Type**: Gauge
+* **Source**: eBPF: `tracepoint/sock/inet_sock_set_state`, `/proc/<pid>/net/tcp`, `/proc/<pid>/net/tcp6`
+* **Labels**: listen_addr (ip:port), proxy
+
+### container_net_tcp_successful_connects_total
+* **Description**: Total number of successful TCP connection attempts
+* **Type**: Counter
+* **Source**: eBPF: `tracepoint/sock/inet_sock_set_state`
+* **Labels**: `destination`, `actual_destination`. The IP and port of the connection’s destination. For example, a container might be establishing a connection to port 80 of a Kubernetes Service IP (e.g., 10.96.1.1). This destination address may be translated by iptables to the actual Pod IP (e.g., 10.40.1.5). In this case, the `actual_destination` would be 10.40.1.5:80.
+
+### container_net_tcp_retransmits_total
+* **Description**: Total number of retransmitted TCP segments. This metric is collected only for outbound TCP connections.
+* **Type**: Counter
+* **Source**: eBPF: `tracepoint/tcp/tcp_retransmit_skb`
+* **Labels**: `destination`, `actual_destination`
+
+### container_net_tcp_failed_connects_total
+* **Description**: Total number of failed TCP connection attempts to a particular endpoint. The agent takes into account only TCP failures, so this metric doesn't reflect DNS errors.
+* **Type**: Counter
+* **Source**: eBPF: `tracepoint/sock/inet_sock_set_state`
+* **Labels**: `destination`
+
+### container_net_tcp_active_connections
+* **Description**: Number of active outbound connections between the container and a particular endpoint
+* **Type**: Gauge
+* **Source**: eBPF: `tracepoint/sock/inet_sock_set_state`
+* **Labels**: `destination`, `actual_destination`
+
+### container_net_latency_seconds
+* **Description**: Round-trip time between the container and a remote IP
+* **Type**: Gauge
+* **Source**: The agent measures the round-trip time of an ICMP request sent to IP addresses the container is currently working with
+* **Labels**: `destination_ip`
+
+## Application layer protocol metrics
+
+### container_http_requests_total
+* **Description**: Total number of outbound HTTP requests made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_http_requests_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound HTTP request
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_postgres_queries_total
+* **Description**: Total number of outbound Postgres queries made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_postgres_queries_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound Postgres query
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_redis_queries_total
+* **Description**: Total number of outbound Redis queries made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_redis_queries_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound Redis query
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_memcached_queries_total
+* **Description**: Total number of outbound Memcached queries made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_memcached_queries_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound Memcached query
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_mysql_queries_total
+* **Description**: Total number of outbound MySQL queries made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_mysql_queries_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound MySQL query
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_mongo_queries_total
+* **Description**: Total number of outbound MongoDB queries made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_mongo_queries_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound MongoDB query
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_kafka_requests_total
+* **Description**: Total number of outbound Kafka requests made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_kafka_requests_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound Kafka request
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_cassandra_queries_total
+* **Description**: Total number of outbound Cassandra queries made by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_cassandra_queries_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound Cassandra query
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_rabbitmq_messages_total
+* **Description**: Total number of RabbitMQ messages produced or consumed by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`, `method`
+
+### container_nats_messages_total
+* **Description**: Total number of NATS messages produced or consumed by the container
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `method`
+
+### container_dubbo_requests_total
+* **Description**: Total number of outbound Dubbo requests
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_dubbo_requests_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound Dubbo request
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_dns_requests_total
+* **Description**: Total number of outbound DNS requests
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `domain`, `request_type`, `status`
+
+### container_dns_requests_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound DNS request
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `le`
+
+### container_clickhouse_requests_total
+* **Description**: Total number of outbound ClickHouse queries
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_clickhouse_requests_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound ClickHouse query
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+### container_zookeeper_requests_total
+* **Description**: Total number of outbound ZooKeeper requests
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `status`
+
+### container_zookeeper_requests_duration_seconds_total
+* **Description**: Histogram of the response time for each outbound ZooKeeper request
+* **Type**: Counter
+* **Source**: eBPF
+* **Labels**: `destination`, `actual_destination`, `le`
+
+## JVM
+
+Each JVM metric has the `jvm` label which refers to the main class or path to the `.jar` file.
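+
+For example, a JVM metric with its compound labels might look like the following in the Prometheus exposition format (the label values here are hypothetical samples):
+
+```
+container_jvm_heap_used_bytes{container_id="k8s/namespace-1/pod-2/container-3",jvm="org.example.Main"} 1.34217728e+08
+```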
+
+### container_jvm_info
+* **Description**: Meta information about the JVM
+* **Type**: Gauge
+* **Source**: `hsperfdata`
+* **Labels**: `jvm`, `java_version`
+
+### container_jvm_heap_size_bytes
+* **Description**: Total heap size in bytes
+* **Type**: Gauge
+* **Source**: `hsperfdata`
+* **Labels**: `jvm`
+
+### container_jvm_heap_used_bytes
+* **Description**: Used heap size in bytes
+* **Type**: Gauge
+* **Source**: `hsperfdata`
+* **Labels**: `jvm`
+
+### container_jvm_gc_time_seconds
+* **Description**: Time spent in the given JVM garbage collector in seconds
+* **Type**: Counter
+* **Source**: `hsperfdata`
+* **Labels**: `jvm`, `gc`
+
+### container_jvm_safepoint_time_seconds
+* **Description**: Time the application has been stopped for safepoint operations in seconds
+* **Type**: Counter
+* **Source**: `hsperfdata`
+* **Labels**: `jvm`
+
+### container_jvm_safepoint_sync_time_seconds
+* **Description**: Time spent getting to safepoints in seconds
+* **Type**: Counter
+* **Source**: `hsperfdata`
+* **Labels**: `jvm`
+
+## .NET runtime
+
+Each .NET runtime metric has the `application` label, which allows distinguishing multiple applications within the same container.
+
+### container_dotnet_info
+* **Description**: Meta information about the Common Language Runtime (CLR)
+* **Type**: Gauge
+* **Source**: .NET diagnostic port
+* **Labels**: `application`, `runtime_version`
+
+### container_dotnet_memory_allocated_bytes_total
+* **Description**: The number of bytes allocated
+* **Type**: Counter
+* **Source**: .NET diagnostic port
+* **Labels**: `application`
+
+### container_dotnet_exceptions_total
+* **Description**: The number of exceptions that have occurred
+* **Type**: Counter
+* **Source**: .NET diagnostic port
+* **Labels**: `application`
+
+### container_dotnet_memory_heap_size_bytes
+* **Description**: Total size of the heap generation in bytes
+* **Type**: Gauge
+* **Source**: .NET diagnostic port
+* **Labels**: `application`, `generation`
+
+### container_dotnet_gc_count_total
+* **Description**: The number of times GC has occurred for the generation
+* **Type**: Counter
+* **Source**: .NET diagnostic port
+* **Labels**: `application`, `generation`
+
+### container_dotnet_heap_fragmentation_percent
+* **Description**: The heap fragmentation
+* **Type**: Gauge
+* **Source**: .NET diagnostic port
+* **Labels**: `application`
+
+### container_dotnet_monitor_lock_contentions_total
+* **Description**: The number of times there was contention when trying to take the monitor's lock
+* **Type**: Counter
+* **Source**: .NET diagnostic port
+* **Labels**: `application`
+
+### container_dotnet_thread_pool_completed_items_total
+* **Description**: The number of work items that have been processed in the ThreadPool
+* **Type**: Counter
+* **Source**: .NET diagnostic port
+* **Labels**: `application`
+
+### container_dotnet_thread_pool_queue_length
+* **Description**: The number of work items that are currently queued to be processed in the ThreadPool
+* **Type**: Gauge
+* **Source**: .NET diagnostic port
+* **Labels**: `application`
+
+### container_dotnet_thread_pool_size
+* **Description**: The number of thread pool threads that currently exist in the ThreadPool
+* **Type**: Gauge
+* **Source**: .NET diagnostic port
+* **Labels**: `application`
+
+## Other
+
+### container_info
+* **Description**: Meta information about the container
+* **Type**: Gauge
+* **Source**: dockerd, containerd
+* **Labels**: `image`
+
+### container_restarts_total
+* **Description**: Number of times the container has been restarted
+* **Type**: Counter
+* **Source**: eBPF: `tracepoint/task/task_newtask`, `tracepoint/sched/sched_process_exit`
+
+### container_application_type
+* **Description**: Type of application running in the container (e.g., memcached, postgres, mysql)
+* **Type**: Gauge
+* **Source**: `/proc/<pid>/cmdline` of the processes running within the container
+* **Labels**: `application_type`
+
+## Logs
+
+### container_log_messages_total
+* **Description**: The number of messages grouped by the automatically extracted repeated patterns
+* **Type**: Counter
+* **Source**: The container's log. The following logging methods are supported:
+  * stdout/stderr: streams are captured by Dockerd (the `json-file` driver) or Containerd (CRI)
+  * Journald
+  * `/var/log/*`
+* **Labels**:
+  * source: `journald`, `stdout/stderr`, or path to the file in the `/var/log` directory
+  * level: < unknown | debug | info | warning | error | critical >
+  * pattern_hash: the ID of the automatically extracted repeated pattern
+  * sample: a sample message of the group
+
+## Node metrics
+
+### node_resources_cpu_usage_seconds_total
+* **Description**: Amount of CPU time spent in each mode
+* **Type**: Counter
+* **Source**: `/proc/stat`
+* **Labels**: mode: < user | nice | system | idle | iowait | irq | softirq | steal >
+
+### node_resources_cpu_logical_cores
+* **Description**: Number of logical CPU cores
+* **Type**: Gauge
+* **Source**: `/proc/stat`
+
+### node_resources_memory_total_bytes
+* **Description**: Total amount of physical memory
+* **Type**: Gauge
+* **Source**: `/proc/meminfo`
+
+### node_resources_memory_free_bytes
+* **Description**: Amount of unassigned memory
+* **Type**: Gauge
+* **Source**: `/proc/meminfo`
+
+### node_resources_memory_available_bytes
+* **Description**: An estimate of how much memory is available for allocations, without swapping.
+Roughly speaking, this is the sum of the `free` memory and a part of the `page cache` that can be reclaimed.
+You can learn more about how this estimate is calculated [here](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=34e431b0ae398fc54ea69ff85ec700722c9da773)
+* **Type**: Gauge
+* **Source**: `/proc/meminfo`
+
+### node_resources_memory_cached_bytes
+* **Description**: Amount of memory used as [page cache](https://en.wikipedia.org/wiki/Page_cache). The memory used for page cache might be reclaimed on memory pressure. This can increase the number of disk reads
+* **Type**: Gauge
+* **Source**: `/proc/meminfo`
+
+### node_resources_disk_(reads|writes)_total
+* **Description**: Total number of reads or writes completed successfully.
+Any disk has a maximum number of IOPS it can serve. Below are reference values for different storage types:
+
+| Type                          | Max IOPS     |
+|-------------------------------|--------------|
+| Amazon EBS sc1                | 250          |
+| Amazon EBS st1                | 500          |
+| Amazon EBS gp2/gp3            | 16,000       |
+| Amazon EBS io1/io2            | 64,000       |
+| Amazon EBS io2 Block Express  | 256,000      |
+| HDD                           | 200          |
+| SATA SSD                      | 100,000      |
+| NVMe SSD                      | 10,000,000   |
+* **Type**: Counter
+* **Source**: `/proc/diskstats`
+* **Labels**: device
+
+### node_resources_disk_(read|written)_bytes_total
+* **Description**: Total number of bytes read from the disk or written to the disk, respectively.
+In addition to the maximum number of IOPS a disk can serve, there is a throughput limit. For example:
+
+| Type                          | Max throughput |
+|-------------------------------|----------------|
+| Amazon EBS sc1                | 250 MB/s       |
+| Amazon EBS st1                | 500 MB/s       |
+| Amazon EBS gp2                | 250 MB/s       |
+| Amazon EBS gp3                | 1,000 MB/s     |
+| Amazon EBS io1/io2            | 1,000 MB/s     |
+| Amazon EBS io2 Block Express  | 4,000 MB/s     |
+| SATA                          | 600 MB/s       |
+| SAS                           | 1,200 MB/s     |
+| NVMe                          | 4,000 MB/s     |
+
+* **Type**: Counter
+* **Source**: `/proc/diskstats`
+* **Labels**: device
+
+### node_resources_disk_(read|write)_time_seconds_total
+* **Description**: Total number of seconds spent reading and writing, respectively, including queue wait.
+To get the average I/O latency, the sum of these two should be normalized by the number of executed I/O requests.
+Below is the reference average I/O latency for different storage types:
+
+| Type                          | Avg latency                |
+|-------------------------------|----------------------------|
+| Amazon EBS gp2/gp3/io1/io2    | "single-digit millisecond" |
+| Amazon EBS io2 Block Express  | "sub-millisecond"          |
+| HDD                           | 2–4ms                      |
+| NVMe SSD                      | 0.1–0.3ms                  |
+
+* **Type**: Counter
+* **Source**: `/proc/diskstats`
+* **Labels**: device
+
+### node_resources_disk_io_time_seconds_total
+* **Description**: Total number of seconds the disk spent doing I/O. It doesn't include queue wait, only service time.
+E.g., if the derivative of this metric over a one-minute interval is 60s, the disk was busy 100% of that interval.
+* **Type**: Counter
+* **Source**: `/proc/diskstats`
+* **Labels**: device
+
+### node_net_received_(bytes|packets)_total
+* **Description**: Total number of bytes and packets received
+* **Type**: Counter
+* **Labels**: interface
+
+### node_net_transmitted_(bytes|packets)_total
+* **Description**: Total number of bytes and packets transmitted
+* **Type**: Counter
+* **Labels**: interface
+
+### node_net_interface_up
+* **Description**: Status of the interface (0: down, 1: up)
+* **Type**: Gauge
+* **Labels**: interface
+
+### node_net_interface_ip
+* **Description**: IP address assigned to the interface
+* **Type**: Gauge
+* **Labels**: interface, ip
+
+### node_gpu_info
+* **Description**: Meta information about the GPU
+* **Type**: Gauge
+* **Labels**: gpu_uuid, name
+
+### node_resources_gpu_memory_total_bytes
+* **Description**: Total memory available on the GPU in bytes
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_resources_gpu_memory_used_bytes
+* **Description**: GPU memory currently in use in bytes
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_resources_gpu_memory_utilization_percent_avg
+* **Description**: Average GPU memory utilization (percentage) over the collection interval
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_resources_gpu_memory_utilization_percent_peak
+* **Description**: Peak GPU memory utilization (percentage) over the collection interval
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_resources_gpu_utilization_percent_avg
+* **Description**: Average GPU core utilization (percentage) over the collection interval
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_resources_gpu_utilization_percent_peak
+* **Description**: Peak GPU core utilization (percentage) over the collection interval
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_resources_gpu_temperature_celsius
+* **Description**: Current temperature of the GPU in Celsius
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_resources_gpu_power_usage_watts
+* **Description**: Current power usage of the GPU in watts
+* **Type**: Gauge
+* **Labels**: gpu_uuid
+
+### node_uptime_seconds
+* **Description**: Uptime of the node in seconds
+* **Type**: Gauge
+
+### node_info
+* **Description**: Meta information about the node
+* **Type**: Gauge
+* **Labels**: hostname, kernel_version, agent_version
+
+### node_cloud_info
+* **Description**: Meta information about the cloud instance
+* **Type**: Gauge
+* **Source**: The agent detects the cloud provider using [sysfs](https://man7.org/linux/man-pages/man5/sysfs.5.html).
+Then it uses cloud-specific metadata services to retrieve additional information about the instance: [AWS](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html),
+[GCP](https://cloud.google.com/compute/docs/metadata/overview), [Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/instance-metadata-service?tabs=linux),
+Hetzner, Scaleway, DigitalOcean, Alibaba.
+For unsupported providers, you can use the `--provider`, `--region`, and `--availability-zone` command-line arguments of the agent to define the labels manually.
+* **Labels**:
+  * provider: < aws | gcp | azure | hetzner | scaleway | digitalocean | alibaba >
+  * account_id: `account_id` for AWS, `project_id` for GCP, `subscription_id` for Azure
+  * instance_id
+  * instance_type
+  * instance_life_cycle: < on-demand | spot | preemptible > (always empty for Azure instances)
+  * region
+  * availability_zone
+  * availability_zone_id: [ID](https://docs.aws.amazon.com/ram/latest/userguide/working-with-az-ids.html) of the availability zone (AWS only)
+  * local_ipv4
+  * public_ipv4
diff --git a/docs/docs/misc/_category_.yaml b/docs/docs/misc/_category_.yaml
index 6789c4274..332585ce1 100644
--- a/docs/docs/misc/_category_.yaml
+++ b/docs/docs/misc/_category_.yaml
@@ -1,6 +1,6 @@
 ---
 label: Misc
-position: 11
+position: 20
 link:
   type: generated-index
   slug: /misc/
diff --git a/docs/docs/profiling/ebpf-based-profiling.md b/docs/docs/profiling/ebpf-based-profiling.md
new file mode 100644
index 000000000..31cc5603f
--- /dev/null
+++ b/docs/docs/profiling/ebpf-based-profiling.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 2
+---
+
+# eBPF-based profiling
+
+Coroot’s agent includes a built-in eBPF-based CPU profiler. It continuously profiles all processes running on a node,
+associates them with container metadata, and sends the results to the collector.
+
+In most cases, the profiler works out of the box with no configuration.
+However, for certain runtimes, additional integration steps can improve symbolization quality.
+
+## Java
+
+For JVM-based applications, accurate stack traces require exposing JIT-compiled symbols.
+Coroot supports this automatically, but the JVM must be started with the following flag:
+
+```bash
+-XX:+PreserveFramePointer
+```
+
+When this flag is set, Coroot’s agent will detect it and periodically invoke the JVM to dump the perf map file (once per minute).
+This works seamlessly with containerized applications.
+
+## Node.js
+
+Node.js also supports generating perf map files. To enable it, start the Node.js process with the following options:
+
+```bash
+--perf-basic-prof-only-functions --interpreted-frames-native-stack
+```
+
+With these flags, the Node.js process will maintain the perf map file automatically.
+Coroot’s agent will detect and use it to improve symbolization.
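+
+For containerized Node.js workloads, one way to pass these options is through the container command in a pod spec. The following is a minimal sketch; the container name, image, and entrypoint script are hypothetical:
+
+```yaml
+containers:
+  - name: my-node-app # hypothetical name
+    image: example.com/my-node-app:latest # hypothetical image
+    command: ["node"]
+    args:
+      # expose JIT-compiled symbols via a perf map file
+      - "--perf-basic-prof-only-functions"
+      - "--interpreted-frames-native-stack"
+      - "server.js" # hypothetical entrypoint
+```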
+
+## Disabling profiling for specific applications
+
+To exclude specific applications from eBPF-based profiling, set the following environment variable for the process:
+
+```bash
+COROOT_EBPF_PROFILING=disabled
+```
+
+Coroot checks the `/proc/<pid>/environ` file for each process and skips profiling when this variable is set.
+
diff --git a/docs/docs/quick-start/enterprise-edition.md b/docs/docs/quick-start/enterprise-edition.md
index 9f316f86f..db36948fd 100644
--- a/docs/docs/quick-start/enterprise-edition.md
+++ b/docs/docs/quick-start/enterprise-edition.md
@@ -53,7 +53,7 @@ To deploy Coroot using Docker Compose, run the following command.
 Before applying it, you can review the configuration file in Coroot's GitHub repository: docker-compose.yaml
 
 ```bash
-curl -fsS https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/docker-compose.yaml | \
+curl -fsS https://raw.githubusercontent.com/coroot/coroot/main/deploy/docker-compose.yaml | \
   LICENSE_KEY="COROOT-LICENSE-KEY-HERE" docker compose -f - up -d
 ```
 
@@ -68,8 +68,8 @@ Deploy the Coroot stack to your cluster by running the following command on the
 Before applying, you can review the configuration file in Coroot's GitHub repository: docker-swarm-stack.yaml
 
 ```bash
-curl -fsS https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/docker-swarm-stack.yaml | \
-  LICENSE_KEY="COROOT-LICENSE-KEY-HERE" docker stack deploy -c - coroot-ee
+curl -fsS https://raw.githubusercontent.com/coroot/coroot/main/deploy/docker-swarm-stack.yaml | \
+  LICENSE_KEY="COROOT-LICENSE-KEY-HERE" docker stack deploy -c - coroot
 ```
 
 Since Docker Swarm doesn't support privileged containers, you'll have to manually deploy coroot-node-agent on each cluster node.
@@ -97,7 +97,7 @@ For detailed steps on installing all the necessary components on an Ubuntu/Debia
 To install Coroot, run the following command:
 
 ```bash
-curl -sfL https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/install.sh | \
+curl -sfL https://raw.githubusercontent.com/coroot/coroot/main/deploy/install.sh | \
   LICENSE_KEY="COROOT-LICENSE-KEY-HERE" \
   BOOTSTRAP_PROMETHEUS_URL="http://127.0.0.1:9090" \
   BOOTSTRAP_REFRESH_INTERVAL=15s \
@@ -123,7 +123,7 @@ For detailed steps on installing all the necessary components on an Ubuntu/Debia
 To install Coroot, run the following command:
 
 ```bash
-curl -sfL https://raw.githubusercontent.com/coroot/coroot-ee/main/deploy/install.sh | \
+curl -sfL https://raw.githubusercontent.com/coroot/coroot/main/deploy/install.sh | \
  LICENSE_KEY="COROOT-LICENSE-KEY-HERE" \
 BOOTSTRAP_PROMETHEUS_URL="http://127.0.0.1:9090" \
 BOOTSTRAP_REFRESH_INTERVAL=15s \
diff --git a/docs/docs/risks/_category_.yaml b/docs/docs/risks/_category_.yaml
new file mode 100644
index 000000000..80cbaa4d1
--- /dev/null
+++ b/docs/docs/risks/_category_.yaml
@@ -0,0 +1,8 @@
+---
+label: Risk Monitoring
+position: 11
+link:
+  type: generated-index
+  slug: /risks/
+
+
diff --git a/docs/docs/risks/overview.md b/docs/docs/risks/overview.md
new file mode 100644
index 000000000..e39b79c4d
--- /dev/null
+++ b/docs/docs/risks/overview.md
@@ -0,0 +1,136 @@
+---
+sidebar_position: 1
+---
+
+# Overview
+
+Risk management is a big part of how SREs think about reliability. It’s not about fixing every possible issue; it’s about constantly asking “what if?” and being prepared for things that could go wrong.
+
+Some risks are fine to tolerate: maybe they’re low impact, unlikely to happen, too expensive to fix, or just not a priority right now.
+Others are quick wins and worth addressing.
+
+But as systems grow more complex and change rapidly, it becomes hard to track risks manually. That’s where automation helps.
+
+Coroot Risk Monitoring automatically detects availability and some security risks across your infrastructure.
+
+Risks monitoring
+
+## Availability
+
+Availability risks are potential situations that can lead to service unavailability or even data loss.
+Coroot uses a model of your system to simulate failure scenarios and identify weak spots.
+
+Below are the currently supported scenarios that Coroot validates for each application:
+
+### Single-instance application
+
+Single-instance application
+
+Even in Kubernetes, a node failure can temporarily make a service unavailable.
+It takes some time for the control plane to detect the failure and reschedule pods. During this period, your app may be unreachable.
+
+To avoid excessive noise, Coroot doesn’t trigger this risk for:
+* Apps that don’t communicate with other services
+* Single-node clusters
+
+You can also dismiss the risk manually if needed.
+
+### All instances on one node
+
+All instances on one node
+
+If your app has multiple replicas, they might all end up scheduled on the same node.
+If that node fails, your service will go down despite having multiple instances.
+
+Coroot excludes this risk for:
+* Single-node clusters
+* Standalone applications
+
+### All instances in one Availability Zone
+
+All instances in one Availability Zone
+
+To survive an Availability Zone (AZ) failure, important applications should have instances spread across multiple AZs.
+
+Running in a single AZ is a valid trade-off in many cases: cross-AZ setups can increase latency and data transfer costs.
+So Coroot only evaluates this risk if your cluster spans multiple AZs.
+
+### All instances on Spot nodes
+
+All instances on Spot nodes
+
+Spot nodes are cheaper and increasingly used even for user-facing services.
+However, they can be terminated at any time with little notice, so your app must be resilient to that.
+
+A common pattern is to mix Spot and On-Demand nodes. This way, even if Spot instances are lost, On-Demand instances can keep the app running.
+
+Coroot flags this risk only if your cluster includes On-Demand nodes.
+For Spot-only clusters, Coroot assumes the setup is intentional and doesn’t report this as a risk.
+
+### Unreplicated databases
+
+For stateful apps like databases, the failure of a node using local storage can result in data loss.
+If the database uses network-attached storage like AWS EBS, the volume can be reattached to another node, but this takes time.
+
+EBS volumes are highly durable (AWS claims 99.999% durability), but reattachment delays can still impact availability.
+
+To mitigate these risks:
+- Use backups to prevent data loss
+- Use replication to reduce downtime
+
+:::warning
+Replication isn’t a replacement for backups: accidental deletions or unexpected changes will be copied to all replicas.
+:::
+
+Coroot can't currently verify backups but can detect whether a database is replicated.
+It checks whether the database service has multiple instances or communicates with another DB (implying replication).
+
+Unreplicated databases
+
+This doesn't validate replication health or data consistency, but it's a useful starting point for identifying at-risk databases.
+
+As always, you can dismiss this risk for any database with one click.
+
+## Security
+
+:::warning
+Currently, Coroot validates only one security risk, so don't consider it a replacement for dedicated security audit tools.
+:::
+::: + +### Publicly Exposed Databases + +Publicly Exposed Databases + + +Since Coroot automatically detects the type of every application or container, it can distinguish between database servers and stateless apps. +It supports a wide range of open-source databases, including PostgreSQL, MySQL, Redis (and its alternatives), Memcached, MongoDB, Elasticsearch, +OpenSearch, ClickHouse, Prometheus, VictoriaMetrics, Kafka, RabbitMQ, and more. + +However, databases accepting connections on public IPs are only part of the problem. +On Kubernetes, services can be exposed through a NodePort or LoadBalancer, making them accessible from the internet. +Coroot already collects data about Kubernetes Services, so we’ve covered those scenarios as well. + +Of course, some databases are intentionally exposed, for example, when access is controlled via firewalls, AWS Security Groups, +or built-in database security mechanisms. If that’s the case, you can simply dismiss the risk. + +## Dismissing risks + +If a risk isn’t relevant, you can dismiss it by clicking the three-dot menu next to the risk: + +Dismiss + +Once dismissed, the risk will be hidden from the main list but still recorded. +To view dismissed risks, enable the **Show dismissed** checkbox at the top of the page: + +Show dismissed risks + +Dismissed risks are shown in a lighter gray color and include a note with the dismissal reason and timestamp. For example: + +> _Dismissed by Admin (2025-05-22 14:09:03) as "tolerable for this project"_ + +This makes it easy to track which risks were reviewed and why they were dismissed, helping ensure transparency and accountability in decision-making. + +You can re-enable any dismissed risk at any time by clicking the same three-dot menu and selecting **Mark as Active**. + +Mark as active + diff --git a/docs/docs/tracing/opentelemetry-java.md b/docs/docs/tracing/opentelemetry-java.md index 6c8990d6e..830c53ab0 100644 --- a/docs/docs/tracing/opentelemetry-java.md +++ b/docs/docs/tracing/opentelemetry-java.md @@ -49,6 +49,7 @@ export \ OTEL_SERVICE_NAME="spring-demo" \ OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="http://coroot.coroot:8080/v1/traces" \ OTEL_EXPORTER_OTLP_TRACES_PROTOCOL="http/protobuf" \ + OTEL_LOGS_EXPORTER="none" \ OTEL_METRICS_EXPORTER="none" \ && java -javaagent:./opentelemetry-javaagent.jar -jar build/libs/demo-0.0.1-SNAPSHOT.jar ``` diff --git a/docs/src/css/custom.css b/docs/src/css/custom.css index 1aefcad6e..8615f36a6 100644 --- a/docs/src/css/custom.css +++ b/docs/src/css/custom.css @@ -15,6 +15,7 @@ --ifm-color-primary-lightest: #3cad6e; --ifm-code-font-size: 95%; --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); + --ifm-card-border-radius: 0.4rem; } /* For readability concerns, you should choose a lighter palette in dark mode. 
*/ @@ -29,14 +30,25 @@ --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); } +[data-theme='dark'] .markdown img { + filter: brightness(90%); +} + .card { - border: 1px solid #ddd; /* Light gray border */ - border-radius: 4px; /* Rounded corners */ - box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); /* Subtle shadow */ - padding: 16px; /* Padding around the image */ - margin: 16px 0 16px 0; - background-color: #fff; /* Optional: white background for contrast */ - transition: transform 0.2s ease; /* Smooth zoom effect on hover */ + border: 1px solid #ddd; + padding: 16px; + margin: 4px; +} + +.markdown img { + border-radius: 4px; + padding: 16px; + background-color: white; +} + +.markdown img.card { + border: 1px solid #ddd; + margin-bottom: 16px; } .w-1200 { @@ -53,14 +65,11 @@ .horizontal-images { display: flex; - justify-content: space-between; - gap: 20px; - margin: 16px 0 16px 0; + gap: 16px; } .horizontal-images img { - width: 48%; - height: auto; + flex-basis: 50%; } .primary-button { @@ -73,4 +82,4 @@ } .primary-button:hover { background-color: #2958FD; -} \ No newline at end of file +} diff --git a/docs/src/theme/DocItem/Footer/index.tsx b/docs/src/theme/DocItem/Footer/index.tsx index d61d7aa9d..a112aa703 100644 --- a/docs/src/theme/DocItem/Footer/index.tsx +++ b/docs/src/theme/DocItem/Footer/index.tsx @@ -1,5 +1,8 @@ +// @ts-ignore import React from 'react'; +// @ts-ignore import Footer from '@theme-original/DocItem/Footer'; +// @ts-ignore import type FooterType from '@theme/DocItem/Footer'; import type { WrapperProps } from '@docusaurus/types'; @@ -9,7 +12,7 @@ export default function FooterWrapper(props: Props): JSX.Element { return ( <>