diff --git a/docs/admin/code-hosts/aws-codecommit.mdx b/docs/admin/code-hosts/aws-codecommit.mdx
index 45bb54f32..0e68adef1 100644
--- a/docs/admin/code-hosts/aws-codecommit.mdx
+++ b/docs/admin/code-hosts/aws-codecommit.mdx
@@ -37,7 +37,7 @@ AWS CodeCommit connections support the following configuration options, which ar
{/* SCHEMA_SYNC_START: admin/code_hosts/aws_codecommit.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// REQUIRED:
diff --git a/docs/admin/code-hosts/azuredevops.mdx b/docs/admin/code-hosts/azuredevops.mdx
index 08f4c56e2..3e2b37db0 100644
--- a/docs/admin/code-hosts/azuredevops.mdx
+++ b/docs/admin/code-hosts/azuredevops.mdx
@@ -65,7 +65,7 @@ Azure DevOps connections support the following configuration options, which are
{/* SCHEMA_SYNC_START: admin/code_hosts/azuredevops.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
// Authentication alternatives: token OR windowsPassword
diff --git a/docs/admin/code-hosts/bitbucket-cloud.mdx b/docs/admin/code-hosts/bitbucket-cloud.mdx
index 8349f8ef6..2b62143cc 100644
--- a/docs/admin/code-hosts/bitbucket-cloud.mdx
+++ b/docs/admin/code-hosts/bitbucket-cloud.mdx
@@ -116,7 +116,7 @@ Bitbucket Cloud connections support the following configuration options, which a
{/* SCHEMA_SYNC_START: admin/code_hosts/bitbucket_cloud.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// The workspace access token to use when authenticating with Bitbucket Cloud.
diff --git a/docs/admin/code-hosts/bitbucket-server.mdx b/docs/admin/code-hosts/bitbucket-server.mdx
index 01f1bd272..035e6b4ce 100644
--- a/docs/admin/code-hosts/bitbucket-server.mdx
+++ b/docs/admin/code-hosts/bitbucket-server.mdx
@@ -202,7 +202,7 @@ Bitbucket Server / Bitbucket Data Center connections support the following confi
{/* SCHEMA_SYNC_START: admin/code_hosts/bitbucket_server.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
// Authentication alternatives: token OR password
diff --git a/docs/admin/code-hosts/gerrit.mdx b/docs/admin/code-hosts/gerrit.mdx
index 651b5093a..25db5a4ef 100644
--- a/docs/admin/code-hosts/gerrit.mdx
+++ b/docs/admin/code-hosts/gerrit.mdx
@@ -113,7 +113,7 @@ Gerrit connections support the following configuration options, which are specif
{/* SCHEMA_SYNC_START: admin/code_hosts/gerrit.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// If non-null, enforces Gerrit repository permissions. This requires that there is an item in the [site configuration json](https://sourcegraph.com/docs/admin/config/site_config#auth-providers) `auth.providers` field, of type "gerrit" with the same `url` field as specified in this `GerritConnection`.
diff --git a/docs/admin/code-hosts/github.mdx b/docs/admin/code-hosts/github.mdx
index 777eedc88..e461ad3a3 100644
--- a/docs/admin/code-hosts/github.mdx
+++ b/docs/admin/code-hosts/github.mdx
@@ -454,7 +454,7 @@ GitHub connections support the following configuration options, which are specif
{/* SCHEMA_SYNC_START: admin/code_hosts/github.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
// Authentication alternatives: token OR gitHubAppDetails OR externalAccount OR useRandomExternalAccount
diff --git a/docs/admin/code-hosts/gitlab.mdx b/docs/admin/code-hosts/gitlab.mdx
index e125d1fee..e331cee12 100644
--- a/docs/admin/code-hosts/gitlab.mdx
+++ b/docs/admin/code-hosts/gitlab.mdx
@@ -189,7 +189,7 @@ See [Internal rate limits](/admin/code-hosts/rate-limits#internal-rate-limits).
{/* SCHEMA_SYNC_START: admin/code_hosts/gitlab.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// If non-null, enforces GitLab repository permissions. This requires that there be an item in the `auth.providers` field of type "gitlab" with the same `url` field as specified in this `GitLabConnection`.
diff --git a/docs/admin/code-hosts/gitolite.mdx b/docs/admin/code-hosts/gitolite.mdx
index d085e26ed..c9d61a59f 100644
--- a/docs/admin/code-hosts/gitolite.mdx
+++ b/docs/admin/code-hosts/gitolite.mdx
@@ -25,7 +25,7 @@ To connect Gitolite to Sourcegraph:
{/* SCHEMA_SYNC_START: admin/code_hosts/gitolite.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// A list of repositories to never mirror from this Gitolite instance. Supports excluding by exact name ({"name": "foo"}).
diff --git a/docs/admin/code-hosts/other.mdx b/docs/admin/code-hosts/other.mdx
index cb0ee85c5..bf95ea9b6 100644
--- a/docs/admin/code-hosts/other.mdx
+++ b/docs/admin/code-hosts/other.mdx
@@ -68,7 +68,7 @@ Repositories must be listed individually:
{/* SCHEMA_SYNC_START: admin/code_hosts/other_external_service.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// A list of repositories to never mirror by name after applying repositoryPathPattern. Supports excluding by exact name ({"name": "myrepo"}) or regular expression ({"pattern": ".*secret.*"}).
diff --git a/docs/admin/code-hosts/phabricator.mdx b/docs/admin/code-hosts/phabricator.mdx
index 5e05fdfaa..f2c50a6dd 100644
--- a/docs/admin/code-hosts/phabricator.mdx
+++ b/docs/admin/code-hosts/phabricator.mdx
@@ -76,7 +76,7 @@ The Sourcegraph instance's site admin must [update the `corsOrigin` site config
{/* SCHEMA_SYNC_START: admin/code_hosts/phabricator.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// SSH cipher to use when cloning via SSH. Must be a valid choice from `ssh -Q cipher`.
diff --git a/docs/admin/config/settings.mdx b/docs/admin/config/settings.mdx
index b0347f255..181d46e5d 100644
--- a/docs/admin/config/settings.mdx
+++ b/docs/admin/config/settings.mdx
@@ -27,7 +27,7 @@ Settings options and their default values are shown below.
{/* SCHEMA_SYNC_START: admin/config/settings.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
diff --git a/docs/admin/config/site-config.mdx b/docs/admin/config/site-config.mdx
index 36d8d9fe5..0750f3a89 100644
--- a/docs/admin/config/site-config.mdx
+++ b/docs/admin/config/site-config.mdx
@@ -21,7 +21,7 @@ All site configuration options and their default values are shown below.
{/* SCHEMA_SYNC_START: admin/config/site.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
@@ -448,6 +448,9 @@ All site configuration options and their default values are shown below.
// DEPRECATED: Configure maxRepos in search.limits
"maxReposToSearch": -1,
+ // Enable/disable MCP API endpoints under `/.api/mcp`. When disabled, MCP endpoints return 404.
+ "mcp.enabled": true,
+
"modelConfiguration": null,
// Notifications received from Sourcegraph.com to display in Sourcegraph.
@@ -648,7 +651,7 @@ All site configuration options and their default values are shown below.
  "tls.external": null,
-  // The channel on which to automatically check for Sourcegraph updates.
+  // ⚠️ DEPRECATED: This setting has no effect.
// Valid options: "release", "none"
// Other example values:
// - "none"
@@ -737,7 +741,7 @@ All site configuration options and their default values are shown below.
// When enabled, users are required to connect at least one external account to their Sourcegraph account. Site admins are exempt from this requirement.
"auth.enforceExternalAccountConnection": false,
- // Enables OAuth 2.0 Dynamic Client Registration (RFC 7591) for the Sourcegraph identity provider. When enabled, OAuth clients can self-register programmatically instead of requiring manual pre-configuration. Required for MCP clients and other applications that use dynamic registration for authorization.
+ // Enables OAuth 2.0 Dynamic Client Registration (RFC 7591) for the Sourcegraph identity provider. When enabled, OAuth clients can self-register programmatically instead of requiring manual pre-configuration. This is used for MCP clients; if `mcp.enabled` is `false`, this setting is treated as `false`.
"auth.idpDynamicClientRegistrationEnabled": true,
// The config options for account lockout
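
For illustration, a minimal site configuration fragment exercising the two settings changed above — the keys come from the schema in this diff, while the values are example choices rather than defaults:

```json
{
  // Turn off the MCP endpoints under `/.api/mcp`; requests to them return 404.
  "mcp.enabled": false,

  // Dynamic client registration is only used for MCP clients, so with
  // "mcp.enabled": false this value is treated as false regardless of
  // what is configured here.
  "auth.idpDynamicClientRegistrationEnabled": true
}
```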
diff --git a/docs/admin/repo/perforce.mdx b/docs/admin/repo/perforce.mdx
index 55a6d31e4..524089737 100644
--- a/docs/admin/repo/perforce.mdx
+++ b/docs/admin/repo/perforce.mdx
@@ -228,7 +228,7 @@ With this setting, Sourcegraph will ignore any rules with a host other than `*`,
{/* SCHEMA_SYNC_START: admin/code_hosts/perforce.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-27T11:45:04Z */}
```json
{
// If non-null, enforces Perforce depot permissions.
diff --git a/docs/admin/telemetry/private-metadata-allowlist.mdx b/docs/admin/telemetry/private-metadata-allowlist.mdx
index a887e247f..609fe90f9 100644
--- a/docs/admin/telemetry/private-metadata-allowlist.mdx
+++ b/docs/admin/telemetry/private-metadata-allowlist.mdx
@@ -33,4 +33,5 @@ To learn more, refer to the [telemetry documentation](https://sourcegraph.com/do
| `cody.modelSelector` | _(all)_ | `modelId` — High-cardinality model identifier; helpful for determining the model selected in the model selector.
`modelProvider` — High-cardinality model provider; helpful for determining the model selected in the model selector. |
| `cody.smart-apply.context` | `applied` | `model` — High-cardinality model identifier; helpful for determining the model that was selected. |
| `deepsearch` | `search.toolcall` | `toolName` — High-cardinality tool name; helpful for determining which tools are being used during deep search.
`toolId` — High-cardinality tool identifier; helpful for determining which tools are being used during deep search.
`model` — High-cardinality model identifier; helpful for determining which models are being used during deep search. |
+| `admin.users` | `delete` | `userIDs` — Numeric identifiers of users being deleted; needed for audit and analytics of admin user management actions. |
| `externalApi` | `request` | `procedure` — ConnectRPC procedure path (e.g. '/sourcegraph.users.v1.UsersService/GetUser'); not sensitive and needed to distinguish which external API RPCs are being used. |
\ No newline at end of file
diff --git a/docs/cli/references/index.mdx b/docs/cli/references/index.mdx
index c511872ba..5f606371d 100644
--- a/docs/cli/references/index.mdx
+++ b/docs/cli/references/index.mdx
@@ -15,11 +15,11 @@
* [`lsp`](references/lsp)
* [`orgs`](references/orgs)
* [`repos`](references/repos)
-* [`sbom` (deprecated)](references/sbom)
+* [`sbom`](references/sbom)
* [`search`](references/search)
* [`search-jobs`](references/search-jobs)
* [`serve-git`](references/serve-git)
-* [`signature` (deprecated)](references/signature)
+* [`signature`](references/signature)
* [`snapshot`](references/snapshot)
* [`teams`](references/teams)
* [`users`](references/users)
diff --git a/docs/cli/references/sbom.mdx b/docs/cli/references/sbom.mdx
index e3750cf6a..76dd910b3 100644
--- a/docs/cli/references/sbom.mdx
+++ b/docs/cli/references/sbom.mdx
@@ -1,13 +1,11 @@
# `src sbom`
-
diff --git a/docs/self-hosted/observability/alerts.mdx b/docs/self-hosted/observability/alerts.mdx
--- a/docs/self-hosted/observability/alerts.mdx
+++ b/docs/self-hosted/observability/alerts.mdx
+## frontend: goroutine_error_percentage_long_window
+
+percentage of failed periodic goroutine executions over a long window
+
+**Descriptions**
+
+- warning frontend: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s
+- critical frontend: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s
+
+**Next steps**
+
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Consider temporarily disabling the routine if it's non-critical and causing cascading issues
+- More help interpreting this metric is available in the [dashboards reference](dashboards#frontend-goroutine_error_percentage_long_window).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_frontend_goroutine_error_percentage_long_window",
+  "critical_frontend_goroutine_error_percentage_long_window"
+]
+```
+
+*Managed by the Sourcegraph Services team.*
+
 mean blocked seconds per conn request
@@ -1386,37 +1423,6 @@ Generated query for critical alert: `min(((src_gitserver_disk_space_available /
-container CPU throttling time %
-
-**Descriptions**
-
-- warning gitserver: 75%+ container CPU throttling time % for 2m0s
-
-**Next steps**
-
-- Consider increasing the CPU limit for the container.
-- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-cpu_throttling_time).
-- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
-
-```json
-"observability.silenceAlerts": [
-  "warning_gitserver_cpu_throttling_time"
-]
-```
-
-*Managed by the Sourcegraph Services team.*
-
 echo test command duration
@@ -1513,6 +1519,99 @@ Generated query for warning alert: `max((sum(src_gitserver_clone_queue)) >= 2
+## gitserver: cpu_usage_percentage
+
+CPU usage
+
+**Descriptions**
+
+- warning gitserver: 95%+ CPU usage for 10m0s
+
+**Next steps**
+
+- Consider increasing CPU limits or scaling out.
+- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-cpu_usage_percentage).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_cpu_usage_percentage"
+]
+```
+
+*Managed by the Sourcegraph Services team.*
+
+## gitserver: memory_rss
+
+memory (RSS)
+
+**Descriptions**
+
+- warning gitserver: 90%+ memory (RSS) for 10m0s
+
+**Next steps**
+
+- Consider increasing memory limits or scaling out.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-memory_rss).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_memory_rss"
+]
+```
+
+*Managed by the Sourcegraph Services team.*
+
+## gitserver: cpu_throttling_time
+
+container CPU throttling time %
+
+**Descriptions**
+
+- warning gitserver: 75%+ container CPU throttling time % for 2m0s
+
+**Next steps**
+
+- Consider increasing the CPU limit for the container.
+- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-cpu_throttling_time).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_gitserver_cpu_throttling_time"
+]
+```
+
+*Managed by the Sourcegraph Services team.*
+
 rate of git command corruption retry attempts over 5m
@@ -1613,23 +1712,27 @@ Generated query for warning alert: `max((sum by (name, job_name) (rate(src_perio
-CPU usage
+percentage of failed periodic goroutine executions over a long window

 **Descriptions**

-- warning gitserver: 95%+ CPU usage for 10m0s
+- warning gitserver: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s
+- critical gitserver: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s

 **Next steps**

-- Consider increasing CPU limits or scaling out.
-- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-cpu_usage_percentage).
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Consider temporarily disabling the routine if it's non-critical and causing cascading issues
+- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-goroutine_error_percentage_long_window).
 - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:

 ```json
 "observability.silenceAlerts": [
-  "warning_gitserver_cpu_usage_percentage"
+  "warning_gitserver_goroutine_error_percentage_long_window",
+  "critical_gitserver_goroutine_error_percentage_long_window"
 ]
 ```

@@ -1638,38 +1741,9 @@ Generated query for warning alert: `max((sum by (name, job_name) (rate(src_perio
-memory (RSS)
-
-**Descriptions**
-
-- warning gitserver: 90%+ memory (RSS) for 10m0s
-
-**Next steps**
-
-- Consider increasing memory limits or scaling out.
-- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-memory_rss).
-- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
-
-```json
-"observability.silenceAlerts": [
-  "warning_gitserver_memory_rss"
-]
-```
-
-*Managed by the Sourcegraph Services team.*
-
+Generated query for warning alert: `max(((sum by (name, job_name) (increase(src_periodic_goroutine_errors_total{job=~".*gitserver.*"}[6h])) / clamp_min(sum by (name, job_name) (increase(src_periodic_goroutine_total{job=~".*gitserver.*"}[6h])), 1)) * 100) >= 30)`
+## worker: goroutine_error_percentage_long_window
+
+percentage of failed periodic goroutine executions over a long window
+
+**Descriptions**
+
+- warning worker: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s
+- critical worker: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s
+
+**Next steps**
+
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Consider temporarily disabling the routine if it's non-critical and causing cascading issues
+- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-goroutine_error_percentage_long_window).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_worker_goroutine_error_percentage_long_window",
+  "critical_worker_goroutine_error_percentage_long_window"
+]
+```
+
+*Managed by the Sourcegraph Services team.*
+
 mean blocked seconds per conn request
@@ -4488,6 +4599,43 @@ Generated query for warning alert: `max((sum by (name, job_name) (rate(src_perio
+## searcher: goroutine_error_percentage_long_window
+
+percentage of failed periodic goroutine executions over a long window
+
+**Descriptions**
+
+- warning searcher: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s
+- critical searcher: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s
+
+**Next steps**
+
+- Check service logs for error details related to the failing periodic routine
+- Check if the routine depends on external services that may be unavailable
+- Consider temporarily disabling the routine if it's non-critical and causing cascading issues
+- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-goroutine_error_percentage_long_window).
+- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert:
+
+```json
+"observability.silenceAlerts": [
+  "warning_searcher_goroutine_error_percentage_long_window",
+  "critical_searcher_goroutine_error_percentage_long_window"
+]
+```
+
+*Managed by the Sourcegraph Services team.*
+
 mean blocked seconds per conn request
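
If you need to silence all of the new long-window goroutine alerts at once while a fix is investigated, the keys introduced above can be combined into a single list — a sketch using only the alert names that appear in this diff:

```json
"observability.silenceAlerts": [
  "warning_frontend_goroutine_error_percentage_long_window",
  "critical_frontend_goroutine_error_percentage_long_window",
  "warning_gitserver_goroutine_error_percentage_long_window",
  "critical_gitserver_goroutine_error_percentage_long_window",
  "warning_worker_goroutine_error_percentage_long_window",
  "critical_worker_goroutine_error_percentage_long_window",
  "warning_searcher_goroutine_error_percentage_long_window",
  "critical_searcher_goroutine_error_percentage_long_window"
]
```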
diff --git a/docs/self-hosted/observability/dashboards.mdx b/docs/self-hosted/observability/dashboards.mdx
index dce0c86d8..c31f05a89 100644
--- a/docs/self-hosted/observability/dashboards.mdx
+++ b/docs/self-hosted/observability/dashboards.mdx
@@ -4539,6 +4539,32 @@ sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*fronte
+#### frontend: goroutine_error_percentage_long_window
+
+Percentage of failed periodic goroutine executions over a long window
+
+The percentage of failed executions over the last 6 hours for each periodic goroutine.
+A value above 30% sustained for at least 3 hours indicates persistent failures.
+A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously.
+
+Refer to the [alerts reference](alerts#frontend-goroutine_error_percentage_long_window) for 2 alerts related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102920` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
 95th percentile handler execution time
@@ -4548,7 +4574,7 @@ Longer durations might indicate increased load or processing time.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102920` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102930` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4573,7 +4599,7 @@ This represents how long a complete loop iteration takes before sleeping for the

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102921` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102931` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4598,7 +4624,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102930` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102940` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4623,7 +4649,7 @@ Consistently high values might indicate problematic tenants or inefficient proce

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102931` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102941` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4648,7 +4674,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102940` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102950` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4673,7 +4699,7 @@ A healthy routine should maintain a consistent processing rate.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102941` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102951` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4698,7 +4724,7 @@ Consistent errors indicate problems with specific tenants.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102950` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102960` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4723,7 +4749,7 @@ Values above 5% indicate significant tenant processing problems.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102951` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102961` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -4993,7 +5019,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^(frontend|sourcegraph-
 Memory (RSS)

-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate."
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate.

 Refer to the [alerts reference](alerts#frontend-memory_rss) for 1 alert related to this panel.

@@ -6025,66 +6051,22 @@ Query:
-Container CPU throttling time %
-
-- A high value indicates that the container is spending too much time waiting for CPU cycles.
-
-Refer to the [alerts reference](alerts#gitserver-cpu_throttling_time) for 1 alert related to this panel.
-
-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100010` on your Sourcegraph instance.
-
-*Managed by the Sourcegraph Services team.*
-
-Cpu usage seconds
-
-- This value should not exceed 75% of the CPU limit over a longer period of time.
-- We cannot alert on this as we don`t know the resource allocation.
-- If this value is high for a longer time, consider increasing the CPU limit for the container.
-
-This panel has no related alerts.
-
-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100011` on your Sourcegraph instance.
-
-*Managed by the Sourcegraph Services team.*
-
-#### gitserver: memory_major_page_faults
-
-Gitserver page faults
-
-The number of major page faults in a 5 minute window for gitserver. If this number increases significantly, it indicates that more git API calls need to load data from disk. There may not be enough memory to efficiently support the amount of API requests served concurrently.
+Number of git commands that exceeded the threshold for high memory usage
+
+This graph tracks the number of git subcommands that gitserver ran that exceeded the threshold for high memory usage.
+This graph in itself is not an alert, but it is used to learn about the memory usage of gitserver.
+
+If gitserver frequently serves requests where the status code is KILLED, this graph might help to correlate that
+with the high memory usage.
+
+This graph spiking is not a problem necessarily. But when subcommands or the whole gitserver service are getting
+OOM killed and this graph shows spikes, increasing the memory might be useful.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100020` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100010` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6094,28 +6076,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10002

 Query:

 ```
-rate(container_memory_failures_total{failure_type="pgmajfault", name=~"^gitserver.*"}[5m])
+sort_desc(sum(sum_over_time(src_gitserver_exec_high_memory_usage_count{instance=~`${shard:regex}`}[2m])) by (cmd))
 ```

-Number of git commands that exceeded the threshold for high memory usage
-
-This graph tracks the number of git subcommands that gitserver ran that exceeded the threshold for high memory usage.
-This graph in itself is not an alert, but it is used to learn about the memory usage of gitserver.
-
-If gitserver frequently serves requests where the status code is KILLED, this graph might help to correlate that
-with the high memory usage.
-
-This graph spiking is not a problem necessarily. But when subcommands or the whole gitserver service are getting
-OOM killed and this graph shows spikes, increasing the memory might be useful.
+#### gitserver: running_git_commands
+
+Git commands running on each gitserver instance
+
+A high value signals load.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100021` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100011` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6125,21 +6100,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10002

 Query:

 ```
-sort_desc(sum(sum_over_time(src_gitserver_exec_high_memory_usage_count{instance=~`${shard:regex}`}[2m])) by (cmd))
+sum by (instance, cmd) (src_gitserver_exec_running{instance=~`${shard:regex}`})
 ```

-A high value signals load.
+Rate of git commands received
+
+per second rate per command

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100030` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100012` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6149,21 +6124,22 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10003

 Query:

 ```
-sum by (instance, cmd) (src_gitserver_exec_running{instance=~`${shard:regex}`})
+sum by (cmd) (rate(src_gitserver_exec_duration_seconds_count{instance=~`${shard:regex}`}[5m]))
 ```

-per second rate per command
+Git command CPU usage seconds by requester scope
+
+CPU time consumed by git subcommands, grouped by propagated requester scope and CPU kind.
+Use this to identify high-CPU callers and whether time is spent in user or system CPU.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100031` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100013` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6173,7 +6149,7 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10003

 Query:

 ```
-sum by (cmd) (rate(src_gitserver_exec_duration_seconds_count{instance=~`${shard:regex}`}[5m]))
+topk(20, sum by (scope, kind) (rate(src_gitserver_exec_cpu_seconds_total{instance=~`${shard:regex}`}[5m])))
 ```

@@ -6187,7 +6163,7 @@ sum by (cmd) (rate(src_gitserver_exec_duration_seconds_count{instance=~`${shard:

 Refer to the [alerts reference](alerts#gitserver-echo_command_duration_test) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100040` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100020` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6213,7 +6189,7 @@ by recloning repositories, but this may take a while depending on repo size.

 Refer to the [alerts reference](alerts#gitserver-repo_corrupted) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100041` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100021` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6235,7 +6211,7 @@ sum(rate(src_gitserver_repo_corrupted[5m]))

 Refer to the [alerts reference](alerts#gitserver-repository_clone_queue_size) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100050` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100030` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6261,7 +6237,7 @@ It does not indicate any problems with the instance, but can give a good indicat

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100051` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100031` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6277,15 +6253,26 @@ sum by (job, instance) (src_gitserver_client_concurrent_requests)
-95th percentile gitservice request duration per shard
+A trend to watch out for: when something in-application happens to take a lot of memory, and active file previously used nearly all remaining memory, then:
-
-A high value means any internal service trying to clone a repo from gitserver is slowed down.
+
+1. 'Memory (RSS)' goes up, due to in-application usage
+2. 'Memory usage (Active file)' goes down, as file data held in memory is evicted
+3. 'Page faults' go up, as less data is held in memory (and with that, IOPS, disk read throughput, ...)
-
-This panel has no related alerts.
+
+This can also happen without 'Memory (RSS)' increasing, if the provisioned memory is insufficient to start with.
+A small degree of this behaviour is generally expected, but if it happens significantly or causes user-noticeable impact, it's likely gitserver could benefit from more memory. Look for more user-facing metrics to make a final determination on appropriate resource allocation.
+
+_See https://en.wikipedia.org/wiki/Memory-mapped_file and the related articles for more information about memory maps._
+
+#### gitserver: cpu_usage_percentage
+
+CPU usage
+
+Refer to the [alerts reference](alerts#gitserver-cpu_usage_percentage) for 1 alert related to this panel.

 To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100100` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6297,17 +6284,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10010

 Query:

 ```
-histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{instance=~`${shard:regex}`}[5m])) by (le, gitservice))
+cadvisor_container_cpu_usage_percentage_total{name=~"^gitserver.*"}
 ```

-Gitservice request rate per shard
-
-Per shard gitservice request rate
+Memory usage percentage (total)
+
+An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure.

 This panel has no related alerts.

@@ -6321,17 +6308,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10010

 Query:

 ```
-sum(rate(src_gitserver_gitservice_duration_seconds_count{instance=~`${shard:regex}`}[5m])) by (gitservice)
+cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"}
 ```

-Gitservice requests running per shard
-
-Per shard gitservice requests running
+Memory usage bytes (total)
+
+An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure.

 This panel has no related alerts.

@@ -6345,23 +6332,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10010

 Query:

 ```
-sum(src_gitserver_gitservice_running{instance=~`${shard:regex}`}) by (gitservice)
+max by (name) (container_memory_working_set_bytes{name=~"^gitserver.*"})
 ```

-Total housekeeping tasks by type and status
-
-The rate of housekeeping tasks performed in repositories, broken down by task type and success/failure status
+Memory (RSS)
+
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate.

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#gitserver-memory_rss) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100200` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100110` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6371,21 +6356,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10020

 Query:

 ```
-sum(rate(src_gitserver_janitor_tasks_total{instance=~`${shard:regex}`}[5m])) by (housekeeping_task, status)
+max(container_memory_rss{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0
 ```

-90th percentile latency of successful tasks by type over 5m
-
-The 90th percentile latency of successful housekeeping tasks, broken down by task type
+Memory usage (active file)
+
+This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100210` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100111` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6395,21 +6380,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10021

 Query:

 ```
-histogram_quantile(0.90, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="success"}[5m])) by (le, housekeeping_task))
+max(container_memory_total_active_file_bytes{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0
 ```

-95th percentile latency of successful tasks by type over 5m
-
-The 95th percentile latency of successful housekeeping tasks, broken down by task type
+Memory usage (kernel)
+
+The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100211` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100112` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6419,21 +6404,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10021

 Query:

 ```
-histogram_quantile(0.95, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="success"}[5m])) by (le, housekeeping_task))
+max(container_memory_kernel_usage{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0
 ```

-99th percentile latency of successful tasks by type over 5m
-
-The 99th percentile latency of successful housekeeping tasks, broken down by task type
+Gitserver page faults
+
+The number of major page faults in a 5 minute window for gitserver. If this number increases significantly, it indicates that more git API calls need to load data from disk. There may not be enough memory to efficiently support the amount of API requests served concurrently.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100212` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100120` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6443,21 +6428,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10021

 Query:

 ```
-histogram_quantile(0.99, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="success"}[5m])) by (le, housekeeping_task))
+rate(container_memory_failures_total{failure_type="pgmajfault", name=~"^gitserver.*"}[5m])
 ```

-90th percentile latency of failed tasks by type over 5m
-
-The 90th percentile latency of failed housekeeping tasks, broken down by task type
+Container CPU throttling time %
+
+A high value indicates that the container is spending too much time waiting for CPU cycles.

-This panel has no related alerts.
+Refer to the [alerts reference](alerts#gitserver-cpu_throttling_time) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100220` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100130` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6467,21 +6452,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10022

 Query:

 ```
-histogram_quantile(0.90, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="failure"}[5m])) by (le, housekeeping_task))
+sum by (container_label_io_kubernetes_pod_name) ((rate(container_cpu_cfs_throttled_periods_total{container_label_io_kubernetes_container_name="gitserver", container_label_io_kubernetes_pod_name=~`${shard:regex}`}[5m]) / rate(container_cpu_cfs_periods_total{container_label_io_kubernetes_container_name="gitserver", container_label_io_kubernetes_pod_name=~`${shard:regex}`}[5m])) * 100)
 ```

-95th percentile latency of failed tasks by type over 5m
-
-The 95th percentile latency of failed housekeeping tasks, broken down by task type
+Cpu usage seconds
+
+- This value should not exceed 75% of the CPU limit over a longer period of time.
+- We cannot alert on this as we don't know the resource allocation.
+- If this value is high for a longer time, consider increasing the CPU limit for the container.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100221` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100131` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6491,21 +6478,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10022

 Query:

 ```
-histogram_quantile(0.95, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="failure"}[5m])) by (le, housekeeping_task))
+sum by (container_label_io_kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{container_label_io_kubernetes_container_name="gitserver", container_label_io_kubernetes_pod_name=~`${shard:regex}`}[5m]))
 ```

-99th percentile latency of failed tasks by type over 5m
+#### gitserver: gitservice_request_duration
+
-
-The 99th percentile latency of failed housekeeping tasks, broken down by task type
+95th percentile gitservice request duration per shard
+
+A high value means any internal service trying to clone a repo from gitserver is slowed down.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100222` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100200` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6515,21 +6504,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10022

 Query:

 ```
-histogram_quantile(0.99, sum(rate(src_gitserver_janitor_tasks_latency_bucket{instance=~`${shard:regex}`, status="failure"}[5m])) by (le, housekeeping_task))
+histogram_quantile(0.95, sum(rate(src_gitserver_gitservice_duration_seconds_bucket{instance=~`${shard:regex}`}[5m])) by (le, gitservice))
 ```

-Files pruned by type over 5m
-
-The rate of files pruned during cleanup, broken down by file type
+Gitservice request rate per shard
+
+Per shard gitservice request rate

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100230` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100201` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6539,21 +6528,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10023

 Query:

 ```
-sum(rate(src_gitserver_janitor_pruned_files_total{instance=~`${shard:regex}`}[5m])) by (filetype)
+sum(rate(src_gitserver_gitservice_duration_seconds_count{instance=~`${shard:regex}`}[5m])) by (gitservice)
 ```

-Data structure counts over 5m
-
-The count distribution of various Git data structures in repositories
+Gitservice requests running per shard
+
+Per shard gitservice requests running

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100240` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100202` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6563,21 +6552,23 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10024

 Query:

 ```
-histogram_quantile(0.95, sum(rate(src_gitserver_janitor_data_structure_count_bucket{instance=~`${shard:regex}`}[5m])) by (le, data_structure))
+sum(src_gitserver_gitservice_running{instance=~`${shard:regex}`}) by (gitservice)
 ```

+#### gitserver: janitor_tasks_total
+
-Data structure sizes
-
-The size distribution of various Git data structures in repositories
+Total housekeeping tasks by type and status
+
+The rate of housekeeping tasks performed in repositories, broken down by task type and success/failure status

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100250` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100300` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6587,21 +6578,237 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10025

 Query:

 ```
-histogram_quantile(0.95, sum(rate(src_gitserver_janitor_data_structure_size_bucket{instance=~`${shard:regex}`}[5m])) by (le, data_structure))
+sum(rate(src_gitserver_janitor_tasks_total{instance=~`${shard:regex}`}[5m])) by (housekeeping_task, status)
 ```

-Time since last optimization
-
-The time elapsed since last optimization of various Git data structures
+90th percentile latency of successful tasks by type over 5m
+
+The 90th percentile latency of successful housekeeping tasks, broken down by task type
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100310` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+95th percentile latency of successful tasks by type over 5m
+
+The 95th percentile latency of successful housekeeping tasks, broken down by task type
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100311` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+99th percentile latency of successful tasks by type over 5m
+
+The 99th percentile latency of successful housekeeping tasks, broken down by task type
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100312` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+90th percentile latency of failed tasks by type over 5m
+
+The 90th percentile latency of failed housekeeping tasks, broken down by task type
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100320` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+95th percentile latency of failed tasks by type over 5m
+
+The 95th percentile latency of failed housekeeping tasks, broken down by task type
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100321` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+99th percentile latency of failed tasks by type over 5m
+
+The 99th percentile latency of failed housekeeping tasks, broken down by task type
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100322` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+Files pruned by type over 5m
+
+The rate of files pruned during cleanup, broken down by file type
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100330` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+Data structure counts over 5m
+
+The count distribution of various Git data structures in repositories
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100340` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+Data structure sizes
+
+The size distribution of various Git data structures in repositories
+
+This panel has no related alerts.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100350` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
+Time since last optimization
+
+The time elapsed since last optimization of various Git data structures

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100260` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100360` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6625,7 +6832,7 @@ The rate at which data structures are reported to exist in repositories

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100270` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100370` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6653,7 +6860,7 @@ This metric helps track how often the retry mechanism is triggered.

 Refer to the [alerts reference](alerts#gitserver-git_command_retry_attempts_rate) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100300` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100400` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6678,7 +6885,7 @@ This indicates how effective the retry mechanism is at resolving transient corru

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100301` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100401` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6703,7 +6910,7 @@ These failures will result in repository corruption marking and potential reclon

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100310` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100410` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6729,7 +6936,7 @@ Common causes include network issues, permission changes, or concurrent reposito

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100311` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100411` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6755,7 +6962,7 @@ A low ratio may indicate persistent corruption issues requiring investigation.

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100312` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100412` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6782,7 +6989,7 @@ A value of 0 indicates the routine isn`t running currently, it awaits it`s next

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100400` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100500` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -6807,7 +7014,7 @@ A low or zero value could indicate that a routine is stalled or encountering err

 This panel has no related alerts.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100401` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100501` on your Sourcegraph instance.

 *Managed by the Sourcegraph Platform team.*

@@ -6832,7 +7039,7 @@ A sustained high error rate may indicate a problem with the routine`s configurat

 Refer to the [alerts reference](alerts#gitserver-goroutine_error_rate) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100410` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100510` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6857,7 +7064,7 @@ A value above 5% indicates that a significant portion of routine executions are

 Refer to the [alerts reference](alerts#gitserver-goroutine_error_percentage) for 1 alert related to this panel.

-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100411` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100511` on your Sourcegraph instance.

 *Managed by the Sourcegraph Services team.*

@@ -6873,6 +7080,32 @@ sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*gitser
+#### gitserver: goroutine_error_percentage_long_window
+
+Percentage of failed periodic goroutine executions over a long window
+
+The percentage of failed executions over the last 6 hours for each periodic goroutine.
+A value above 30% sustained for at least 3 hours indicates persistent failures.
+A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously.
+
+Refer to the [alerts reference](alerts#gitserver-goroutine_error_percentage_long_window) for 2 alerts related to this panel.
+
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100520` on your Sourcegraph instance.
+
+*Managed by the Sourcegraph Services team.*
+
 95th percentile handler execution time
@@ -6882,7 +7115,7 @@ Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100420` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100530` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6907,7 +7140,7 @@ This represents how long a complete loop iteration takes before sleeping for the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100421` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100531` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6932,7 +7165,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100430` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100540` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6957,7 +7190,7 @@ Consistently high values might indicate problematic tenants or inefficient proce This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100431` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100541` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6982,7 +7215,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100440` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100550` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -7007,7 +7240,7 @@ A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100441` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100551` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -7032,7 +7265,7 @@ Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100450` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100560` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -7057,7 +7290,7 @@ Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100451` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100561` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -7073,159 +7306,15 @@ Query:CPU usage
+Transmission rate over 5m (aggregate)
-Refer to the [alerts reference](alerts#gitserver-cpu_usage_percentage) for 1 alert related to this panel.
+The rate of bytes sent over the network across all pods.
-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100500` on your Sourcegraph instance.
-
-*Managed by the Sourcegraph Services team.*
-
-Memory usage percentage (total)

- -An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100501` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory usage bytes (total)
- -An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100502` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory (RSS)
- -The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." - -Refer to the [alerts reference](alerts#gitserver-memory_rss) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100510` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory usage (active file)
- -This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100511` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory usage (kernel)
- -The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100512` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Transmission rate over 5m (aggregate)
- -The rate of bytes sent over the network across all pods - -This panel has no related alerts. +This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100600` on your Sourcegraph instance. @@ -17130,6 +17219,32 @@ sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*workerPercentage of failed periodic goroutine executions over a long window
+ +The percentage of failed executions over the last 6 hours for each periodic goroutine. +A value above 30% sustained for at least 3 hours indicates persistent failures. +A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously. + +Refer to the [alerts reference](alerts#worker-goroutine_error_percentage_long_window) for 2 alerts related to this panel. + +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101820` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +95th percentile handler execution time
@@ -17139,7 +17254,7 @@ Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101820` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101830` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17164,7 +17279,7 @@ This represents how long a complete loop iteration takes before sleeping for the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101821` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101831` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17189,7 +17304,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101830` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101840` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17214,7 +17329,7 @@ Consistently high values might indicate problematic tenants or inefficient proce This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101831` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101841` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17239,7 +17354,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101840` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101850` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17264,7 +17379,7 @@ A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101841` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101851` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17289,7 +17404,7 @@ Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101850` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101860` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17314,7 +17429,7 @@ Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101851` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101861` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17584,7 +17699,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^worker.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#worker-memory_rss) for 1 alert related to this panel. @@ -20833,6 +20948,32 @@ sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*searchPercentage of failed periodic goroutine executions over a long window
+ +The percentage of failed executions over the last 6 hours for each periodic goroutine. +A value above 30% sustained for at least 3 hours indicates persistent failures. +A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously. + +Refer to the [alerts reference](alerts#searcher-goroutine_error_percentage_long_window) for 2 alerts related to this panel. + +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101420` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +95th percentile handler execution time
@@ -20842,7 +20983,7 @@ Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101420` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101430` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20867,7 +21008,7 @@ This represents how long a complete loop iteration takes before sleeping for the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101421` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101431` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20892,7 +21033,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101430` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101440` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20917,7 +21058,7 @@ Consistently high values might indicate problematic tenants or inefficient proce This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101431` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101441` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20942,7 +21083,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101440` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101450` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20967,7 +21108,7 @@ A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101441` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101451` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20992,7 +21133,7 @@ Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101450` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101460` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -21017,7 +21158,7 @@ Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101451` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101461` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -21287,7 +21428,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^searcher.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#searcher-memory_rss) for 1 alert related to this panel. @@ -21815,7 +21956,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^syntect-server.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#syntect-server-memory_rss) for 1 alert related to this panel. @@ -22369,7 +22510,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^zoekt-indexserver.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate."
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate.

Refer to the [alerts reference](alerts#zoekt-memory_rss) for 1 alert related to this panel.

@@ -22439,6 +22580,19 @@ max(container_memory_kernel_usage{name=~"^zoekt-indexserver.*"} / container_spec

### Zoekt: Zoekt-webserver (CPU, Memory)

+Zoekt web server leverages memory mapping to optimize file reads: it is generally expected to consume all the memory provided to it, if it can. When it finds data that is not available in memory yet, this causes a 'page fault', and the data is loaded into memory from disk.
+
+A trend to watch out for: when something in-application happens to take a lot of memory, and active file previously used nearly all remaining memory, then:
+
+1. 'Memory (RSS)' goes up, due to in-application usage
+2. 'Memory usage (Active file)' goes down, as file data held in memory is evicted
+3. 'Page faults' go up, as less data is held in memory (and with that, IOPS, disk read throughput, ...)
+
+This can also happen without 'Memory (RSS)' increasing, if the provisioned memory is insufficient to start with.
+A small degree of this behaviour is generally expected, but if it happens significantly or causes user-noticeable impact, it's likely zoekt web server could benefit from more memory. Look for more user-facing metrics to make a final determination on appropriate resource allocation.
+
+_See https://en.wikipedia.org/wiki/Memory-mapped_file and the related articles for more information about memory maps._
+
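+As a rough way to watch for this pattern, a PromQL sketch along these lines can track major page faults (hypothetical: the `container_memory_failures_total` metric and its `failure_type` label follow standard cAdvisor naming and are not taken from this dashboard):
+
+```promql
+# Sketch only: major page faults per second across zoekt-webserver containers.
+# A sustained rise alongside falling active-file memory suggests the eviction
+# pattern described above.
+sum by (name) (rate(container_memory_failures_total{name=~"^zoekt-webserver.*", failure_type="pgmajfault"}[5m]))
+```
+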
 #### zoekt: cpu_usage_percentage

CPU usage


@@ -22513,7 +22667,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^zoekt-webserver.*"})

Memory (RSS)

-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#zoekt-memory_rss) for 1 alert related to this panel. @@ -31886,1547 +32040,7 @@ This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101011` on your Sourcegraph instance. -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation errors every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101012` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation error rate over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101013` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records scanned every 5m
- -The number of candidate records considered for cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101100` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records altered every 5m
- -The number of candidate records altered as part of cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101101` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operations every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101110` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -99th percentile successful job invocation operation duration over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101111` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation errors every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101112` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation error rate over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101113` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records scanned every 5m
- -The number of candidate records considered for cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101200` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records altered every 5m
- -The number of candidate records altered as part of cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101201` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operations every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101210` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -99th percentile successful job invocation operation duration over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101211` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation errors every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101212` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation error rate over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101213` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records scanned every 5m
- -The number of candidate records considered for cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101300` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records altered every 5m
- -The number of candidate records altered as part of cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101301` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operations every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101310` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -99th percentile successful job invocation operation duration over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101311` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation errors every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101312` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation error rate over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101313` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records scanned every 5m
- -The number of candidate records considered for cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101400` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Records altered every 5m
- -The number of candidate records altered as part of cleanup. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101401` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operations every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101410` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -99th percentile successful job invocation operation duration over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101411` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation errors every 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101412` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Job invocation operation error rate over 5m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101413` on your Sourcegraph instance. - -*Managed by the Sourcegraph Code Understanding team.* - -Monitoring telemetry services in Sourcegraph.
- -To see this dashboard, visit `/-/debug/grafana/d/telemetry/telemetry` on your Sourcegraph instance. - -### Telemetry: Telemetry Gateway Exporter: Events export and queue metrics - -#### telemetry: telemetry_gateway_exporter_queue_size - -Telemetry event payloads pending export
- -The number of events queued to be exported. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100000` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Rate of growth of events export queue over 30m
- -A positive value indicates the queue is growing. - -Refer to the [alerts reference](alerts#telemetry-telemetry_gateway_exporter_queue_growth) for 2 alerts related to this panel. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100001` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events exported from queue per hour
- -The number of events being exported. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100010` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Number of events exported per batch over 30m
- -The number of events exported in each batch. The largest bucket is the maximum number of events exported per batch. -If the distribution trends to the maximum bucket, then events export throughput is at or approaching saturation - try increasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE` or decreasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_INTERVAL`. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100011` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events exporter operations every 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100100` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Aggregate successful events exporter operation duration distribution over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100101` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events exporter operation errors every 30m
- -Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_exporter_errors_total) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100102` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events exporter operation error rate over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100103` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events export queue cleanup operations every 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100200` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Aggregate successful events export queue cleanup operation duration distribution over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100201` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events export queue cleanup operation errors every 30m
- -Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_queue_cleanup_errors_total) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100202` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events export queue cleanup operation error rate over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100203` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events export backlog metrics reporting operations every 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100300` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Aggregate successful events export backlog metrics reporting operation duration distribution over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100301` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events export backlog metrics reporting operation errors every 30m
- -Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_queue_metrics_reporter_errors_total) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100302` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Events export backlog metrics reporting operation error rate over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100303` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Failed writes to events export queue over 5m
- -Telemetry V2 writes send events into the `telemetry_events_export_queue` for the exporter to periodically export. - -Refer to the [alerts reference](alerts#telemetry-telemetry_v2_export_queue_write_failures) for 2 alerts related to this panel. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100400` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Failed write V2 events to V1 'event_logs' over 5m
- -Telemetry V2 writes also attempt to `tee` events into the legacy V1 events format in the `event_logs` database table for long-term local persistence. - -Refer to the [alerts reference](alerts#telemetry-telemetry_v2_event_logs_write_failures) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100401` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -(off by default) user metadata exporter operations every 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100500` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Aggregate successful (off by default) user metadata exporter operation duration distribution over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100501` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -(off by default) user metadata exporter operation errors every 30m
- -Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_usermetadata_exporter_errors_total) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100502` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -(off by default) user metadata exporter operation error rate over 30m
- -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100503` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -The OpenTelemetry collector ingests OpenTelemetry data from Sourcegraph and exports it to the configured backends.
- -To see this dashboard, visit `/-/debug/grafana/d/otel-collector/otel-collector` on your Sourcegraph instance. - -### OpenTelemetry Collector: Receivers - -#### otel-collector: otel_span_receive_rate - -Spans received per receiver per minute
- -Shows the rate of spans accepted by the configured reveiver - -A Trace is a collection of spans and a span represents a unit of work or operation. Spans are the building blocks of Traces. -The spans have only been accepted by the receiver, which means they still have to move through the configured pipeline to be exported. -For more information on tracing and configuration of a OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers. - -See the Exporters section see spans that have made it through the pipeline and are exported. - -Depending the configured processors, received spans might be dropped and not exported. For more information on configuring processors see -https://opentelemetry.io/docs/collector/configuration/#processors. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100000` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Spans refused per receiver
- - - -Refer to the [alerts reference](alerts#otel-collector-otel_span_refused) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100001` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Spans exported per exporter per minute
- -Shows the rate of spans being sent by the exporter - -A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces. -The rate of spans here indicates spans that have made it through the configured pipeline and have been sent to the configured export destination. - -For more information on configuring a exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100100` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Span export failures by exporter
- -Shows the rate of spans failed to be sent by the configured reveiver. A number higher than 0 for a long period can indicate a problem with the exporter configuration or with the service that is being exported too - -For more information on configuring a exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters. - -Refer to the [alerts reference](alerts#otel-collector-otel_span_export_failures) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100101` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Exporter queue capacity
- -Shows the the capacity of the retry queue (in batches). - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100200` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Exporter queue size
- -Shows the current size of retry queue - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100201` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Exporter enqueue failed spans
- -Shows the rate of spans failed to be enqueued by the configured exporter. A number higher than 0 for a long period can indicate a problem with the exporter configuration - -Refer to the [alerts reference](alerts#otel-collector-otelcol_exporter_enqueue_failed_spans) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100202` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Spans dropped per processor per minute
- -Shows the rate of spans dropped by the configured processor - -Refer to the [alerts reference](alerts#otel-collector-otelcol_processor_dropped_spans) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100300` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Cpu usage of the collector
- -Shows CPU usage as reported by the OpenTelemetry collector. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100400` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Memory allocated to the otel collector
- -Shows the allocated memory Resident Set Size (RSS) as reported by the OpenTelemetry collector. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100401` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Memory used by the collector
- -Shows how much memory is being used by the otel collector. - -* High memory usage might indicate thad the configured pipeline is keeping a lot of spans in memory for processing -* Spans failing to be sent and the exporter is configured to retry -* A high batch count by using a batch processor - -For more information on configuring processors for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#processors. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100402` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Container missing
- -This value is the number of times a container has not been seen for more than one minute. If you observe this -value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reasons. - -- **Kubernetes:** - - Determine if the pod was OOM killed using `kubectl describe pod otel-collector` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p otel-collector`. -- **Docker Compose:** - - Determine if the pod was OOM killed using `docker inspect -f '\{\{json .State\}\}' otel-collector` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the otel-collector container in `docker-compose.yml`. - - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs otel-collector` (note this will include logs from the previous and currently running container). - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100500` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Container cpu usage total (1m average) across all cores by instance
- -Refer to the [alerts reference](alerts#otel-collector-container_cpu_usage) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100501` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Container memory usage by instance
- -Refer to the [alerts reference](alerts#otel-collector-container_memory_usage) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100502` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Filesystem reads and writes rate by instance over 1h
- -This value indicates the number of filesystem read and write operations by containers of this service. -When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with \{\{CONTAINER_NAME\}\} issues. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100503` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* - -Percentage pods available
- -Refer to the [alerts reference](alerts#otel-collector-pods_available_percentage) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100600` on your Sourcegraph instance. - -*Managed by the Sourcegraph Platform team.* +*Managed by the Sourcegraph Code Understanding team.*Cody chat and code completions.
- -To see this dashboard, visit `/-/debug/grafana/d/completions/completions` on your Sourcegraph instance. - -### Completions: Completions requests - -#### completions: api_request_rate - -Rate of completions API requests
+#### codeintel-uploads: codeintel_uploads_hard_deleter_errors_total -Rate (QPS) of requests to cody chat and code completion endpoints. +Job invocation operation errors every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100000` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101012` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: total time (p99)
+#### codeintel-uploads: codeintel_uploads_hard_deleter_error_rate -Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming. +Job invocation operation error rate over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101013` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: total time (p95)
+Records scanned every 5m
-Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming. +The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101100` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: total time (p75)
+Records altered every 5m
-Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming. +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100102` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101101` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: total time (p50)
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_total -Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming. +Job invocation operations every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100103` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101110` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Non-stream overhead (p99)
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_99th_percentile_duration -Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc. +99th percentile successful job invocation operation duration over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100110` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101111` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Non-stream overhead (p95)
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_errors_total -Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc. +Job invocation operation errors every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100111` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101112` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Non-stream overhead (p75)
+#### codeintel-uploads: codeintel_uploads_janitor_audit_logs_error_rate -Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc. +Job invocation operation error rate over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100112` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101113` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Non-stream overhead (p50)
+Records scanned every 5m
-Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc. +The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100113` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101200` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: time to first event (p99)
+Records altered every 5m
-Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back. +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100120` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101201` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: time to first event (p95)
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_total -Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back. +Job invocation operations every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100121` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101210` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: time to first event (p75)
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_99th_percentile_duration -Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back. +99th percentile successful job invocation operation duration over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100122` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101211` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: time to first event (p50)
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_errors_total -Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back. +Job invocation operation errors every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100123` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101212` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: first byte sent -> received (p99)
+#### codeintel-uploads: codeintel_uploads_janitor_scip_documents_error_rate -Time between sending the first byte to the upstream, and then getting the first byte back from the upstream. +Job invocation operation error rate over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100130` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101213` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: first byte sent -> received (p95)
+Records scanned every 5m
-Time between sending the first byte to the upstream, and then getting the first byte back from the upstream. +The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100131` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101300` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: first byte sent -> received (p75)
+Records altered every 5m
-Time between sending the first byte to the upstream, and then getting the first byte back from the upstream. +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100132` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101301` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: first byte sent -> received (p50)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_total -Time between sending the first byte to the upstream, and then getting the first byte back from the upstream. +Job invocation operations every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100133` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101310` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: total (p99)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_99th_percentile_duration -Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. +99th percentile successful job invocation operation duration over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100140` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101311` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: total (p95)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_errors_total -Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. +Job invocation operation errors every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100141` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101312` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: total (p75)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_metadata_error_rate -Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. +Job invocation operation error rate over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100142` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101313` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: total (p50)
+Records scanned every 5m
-Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc. +The number of candidate records considered for cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100143` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101400` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: dns (p99)
+Records altered every 5m
-Portion of time spent on DNS when acquiring an HTTP connection to the upstream. +The number of candidate records altered as part of cleanup. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100150` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101401` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: dns (p95)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_total -Portion of time spent on DNS when acquiring an HTTP connection to the upstream. +Job invocation operations every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100151` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101410` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: dns (p75)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_99th_percentile_duration -Portion of time spent on DNS when acquiring an HTTP connection to the upstream. +99th percentile successful job invocation operation duration over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100152` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101411` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: dns (p50)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_errors_total -Portion of time spent on DNS when acquiring an HTTP connection to the upstream. +Job invocation operation errors every 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100153` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101412` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Stream: HTTP connect: tls (p99)
+#### codeintel-uploads: codeintel_uploads_reconciler_scip_data_error_rate -Portion of time spent on TLS when acquiring an HTTP connection to the upstream. +Job invocation operation error rate over 5m
This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100160` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/codeintel-uploads/codeintel-uploads?viewPanel=101413` on your Sourcegraph instance. +*Managed by the Sourcegraph Code Understanding team.*Monitoring telemetry services in Sourcegraph.
+ +To see this dashboard, visit `/-/debug/grafana/d/telemetry/telemetry` on your Sourcegraph instance. + +### Telemetry: Telemetry Gateway Exporter: Events export and queue metrics + +#### telemetry: telemetry_gateway_exporter_queue_size -Stream: HTTP connect: tls (p95)
+Telemetry event payloads pending export
-Portion of time spent on TLS when acquiring an HTTP connection to the upstream. +The number of events queued to be exported. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100161` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100000` on your Sourcegraph instance. +*Managed by the Sourcegraph Services team.*Stream: HTTP connect: tls (p75)
+Rate of growth of events export queue over 30m
-Portion of time spent on TLS when acquiring an HTTP connection to the upstream. +A positive value indicates the queue is growing. -This panel has no related alerts. +Refer to the [alerts reference](alerts#telemetry-telemetry_gateway_exporter_queue_growth) for 2 alerts related to this panel. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100162` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100001` on your Sourcegraph instance. +*Managed by the Sourcegraph Services team.*Stream: HTTP connect: tls (p50)
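+
+For the queue-growth panel above, the intent can be sketched as follows (hypothetical: the `src_telemetrygatewayexporter_queue_size` gauge name is an assumption, not taken from this page):
+
+```promql
+# Sketch only: positive values mean the export queue grew over the trailing 30m.
+max(delta(src_telemetrygatewayexporter_queue_size[30m]))
+```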
+Events exported from queue per hour
-Portion of time spent on TLS when acquiring an HTTP connection to the upstream. +The number of events being exported. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100163` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100010` on your Sourcegraph instance. +*Managed by the Sourcegraph Services team.*Stream: HTTP connect: dial (p99)
+Number of events exported per batch over 30m
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream. +The number of events exported in each batch. The largest bucket is the maximum number of events exported per batch. +If the distribution trends to the maximum bucket, then events export throughput is at or approaching saturation - try increasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE` or decreasing `TELEMETRY_GATEWAY_EXPORTER_EXPORT_INTERVAL`. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100170` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100011` on your Sourcegraph instance. +*Managed by the Sourcegraph Services team.*Stream: HTTP connect: dial (p95)
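+
+As a minimal tuning sketch (not shipped configuration): this assumes the exporter runs in the `worker` deployment and uses illustrative values, so adjust the deployment name, namespace, and numbers to your environment.
+
+```bash
+# Raise the per-batch event limit and shorten the export interval on the
+# (assumed) worker deployment, then wait for the change to roll out:
+kubectl set env deployment/worker \
+  TELEMETRY_GATEWAY_EXPORTER_EXPORT_BATCH_SIZE=1000 \
+  TELEMETRY_GATEWAY_EXPORTER_EXPORT_INTERVAL=5m
+kubectl rollout status deployment/worker
+```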
-Stream: HTTP connect: dial (p95)
+#### telemetry: telemetrygatewayexporter_exporter_total
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.
+Events exporter operations every 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100171` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100100` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: HTTP connect: dial (p75)
+#### telemetry: telemetrygatewayexporter_exporter_99th_percentile_duration
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.
+Aggregate successful events exporter operation duration distribution over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100172` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100101` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: HTTP connect: dial (p50)
+#### telemetry: telemetrygatewayexporter_exporter_errors_total
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.
+Events exporter operation errors every 30m
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_exporter_errors_total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100173` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100102` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: total time (p99)
+#### telemetry: telemetrygatewayexporter_exporter_error_rate
-Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.
+Events exporter operation error rate over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100200` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100103` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: total time (p95)
+#### telemetry: telemetrygatewayexporter_queue_cleanup_total
-Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.
+Events export queue cleanup operations every 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100201` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100200` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: total time (p75)
+#### telemetry: telemetrygatewayexporter_queue_cleanup_99th_percentile_duration
-Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.
+Aggregate successful events export queue cleanup operation duration distribution over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100202` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100201` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: total time (p50)
+#### telemetry: telemetrygatewayexporter_queue_cleanup_errors_total
-Time spent on the Stream() invocation, i.e. how long results take to connect, stream results, and finish streaming.
+Events export queue cleanup operation errors every 30m
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_queue_cleanup_errors_total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100203` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100202` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Non-stream overhead (p99)
+#### telemetry: telemetrygatewayexporter_queue_cleanup_error_rate
-Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.
+Events export queue cleanup operation error rate over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100210` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100203` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Non-stream overhead (p95)
+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_total
-Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.
+Events export backlog metrics reporting operations every 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100211` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100300` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Non-stream overhead (p75)
+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_99th_percentile_duration
-Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.
+Aggregate successful events export backlog metrics reporting operation duration distribution over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100212` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100301` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Non-stream overhead (p50)
+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_errors_total
-Time between Go HTTP handler invocation and Stream() invocation, overhead of e.g. request validation, routing to gateway/other, model resolution, error reporting/tracing, guardrails, etc.
+Events export backlog metrics reporting operation errors every 30m
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_queue_metrics_reporter_errors_total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100213` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100302` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: time to first event (p99)
+#### telemetry: telemetrygatewayexporter_queue_metrics_reporter_error_rate
-Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.
+Events export backlog metrics reporting operation error rate over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100220` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100303` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: time to first event (p95)
+#### telemetry: telemetry_v2_export_queue_write_failures
-Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.
+Failed writes to events export queue over 5m
-This panel has no related alerts.
+Telemetry V2 writes send events into the `telemetry_events_export_queue` for the exporter to periodically export.
+
+Refer to the [alerts reference](alerts#telemetry-telemetry_v2_export_queue_write_failures) for 2 alerts related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100221` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100400` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
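+
+As a hedged illustration, you can inspect the backlog in the table named above directly. This assumes `psql` access to the Sourcegraph database; the database name and the `exported_at IS NULL` filter are assumptions about the schema, so adjust or drop them as needed.
+
+```bash
+# Count queued telemetry events that have not yet been exported:
+psql -d sourcegraph -c \
+  "SELECT count(*) FROM telemetry_events_export_queue WHERE exported_at IS NULL;"
+```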
-Stream: time to first event (p75)
+#### telemetry: telemetry_v2_event_logs_write_failures
+Failed write V2 events to V1 'event_logs' over 5m
-Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.
+Telemetry V2 writes also attempt to `tee` events into the legacy V1 events format in the `event_logs` database table for long-term local persistence.
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetry_v2_event_logs_write_failures) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100222` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100401` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: time to first event (p50)
+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_total
-Time between calling Stream(), the client connecting to the server etc. and actually getting the first streaming event back.
+(off by default) user metadata exporter operations every 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100223` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100500` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: first byte sent -> received (p99)
+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_99th_percentile_duration
-Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.
+Aggregate successful (off by default) user metadata exporter operation duration distribution over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100230` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100501` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: first byte sent -> received (p95)
+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_errors_total
-Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.
+(off by default) user metadata exporter operation errors every 30m
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#telemetry-telemetrygatewayexporter_usermetadata_exporter_errors_total) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100231` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100502` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
-Stream: first byte sent -> received (p75)
+#### telemetry: telemetrygatewayexporter_usermetadata_exporter_error_rate
-Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.
+(off by default) user metadata exporter operation error rate over 30m
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100232` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/telemetry/telemetry?viewPanel=100503` on your Sourcegraph instance.
+*Managed by the Sourcegraph Services team.*
+The OpenTelemetry collector ingests OpenTelemetry data from Sourcegraph and exports it to the configured backends.
+
+To see this dashboard, visit `/-/debug/grafana/d/otel-collector/otel-collector` on your Sourcegraph instance.
+
+### OpenTelemetry Collector: Receivers
+
+#### otel-collector: otel_span_receive_rate
+
+Spans received per receiver per minute
+
+Shows the rate of spans accepted by the configured receiver.
+
+A Trace is a collection of spans and a span represents a unit of work or operation. Spans are the building blocks of Traces.
+The spans have only been accepted by the receiver, which means they still have to move through the configured pipeline to be exported.
+For more information on tracing and configuration of an OpenTelemetry receiver see https://opentelemetry.io/docs/collector/configuration/#receivers.
-Stream: first byte sent -> received (p50)
+See the Exporters section to see spans that have made it through the pipeline and are exported.
-Time between sending the first byte to the upstream, and then getting the first byte back from the upstream.
+Depending on the configured processors, received spans might be dropped and not exported. For more information on configuring processors see
+https://opentelemetry.io/docs/collector/configuration/#processors.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100233` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100000` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
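+
+For orientation, here is a minimal receiver-to-exporter pipeline sketch. The file path and backend endpoint are assumptions for illustration, not the configuration Sourcegraph ships with.
+
+```bash
+# Write a small OpenTelemetry collector config: spans arrive at the OTLP
+# receiver, are batched by a processor, and leave through an OTLP exporter.
+cat > /etc/otel-collector/config.yaml <<'EOF'
+receivers:
+  otlp:
+    protocols:
+      grpc:
+processors:
+  batch:
+exporters:
+  otlp:
+    endpoint: jaeger:4317   # assumed tracing backend address
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp]
+EOF
+```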
+#### otel-collector: otel_span_refused
+Spans refused per receiver
-Stream: HTTP connect: total (p99)
-Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#otel-collector-otel_span_refused) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100240` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100001` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
+Spans exported per exporter per minute
+
+Shows the rate of spans being sent by the exporter.
-Stream: HTTP connect: total (p95)
+A Trace is a collection of spans. A Span represents a unit of work or operation. Spans are the building blocks of Traces.
+The rate of spans here indicates spans that have made it through the configured pipeline and have been sent to the configured export destination.
-Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.
+For more information on configuring an exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100241` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100100` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: total (p75)
+#### otel-collector: otel_span_export_failures
+Span export failures by exporter
-Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.
+Shows the rate of spans that failed to be sent by the configured exporter. A number higher than 0 for a long period can indicate a problem with the exporter configuration or with the service that is being exported to.
-This panel has no related alerts.
+For more information on configuring an exporter for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#exporters.
+
+Refer to the [alerts reference](alerts#otel-collector-otel_span_export_failures) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100242` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100101` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: total (p50)
+Exporter queue capacity
-Time spent acquiring an HTTP connection to the upstream, either from an existing pool OR by performing DNS resolution, TCP connection, etc.
+Shows the capacity of the retry queue (in batches).
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100243` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100200` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: dns (p99)
+Exporter queue size
-Portion of time spent on DNS when acquiring an HTTP connection to the upstream.
+Shows the current size of the retry queue.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100250` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100201` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: dns (p95)
+#### otel-collector: otelcol_exporter_enqueue_failed_spans
+Exporter enqueue failed spans
-Portion of time spent on DNS when acquiring an HTTP connection to the upstream.
+Shows the rate of spans that failed to be enqueued by the configured exporter. A number higher than 0 for a long period can indicate a problem with the exporter configuration.
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#otel-collector-otelcol_exporter_enqueue_failed_spans) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100251` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100202` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
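+
+A quick hedged check: the collector serves its own Prometheus metrics on port 8888 by default, so you can confirm enqueue failures directly (adjust the hostname to your deployment).
+
+```bash
+# Scrape the collector's internal metrics endpoint and look for
+# enqueue failures reported by the exporter:
+curl -s http://otel-collector:8888/metrics | grep otelcol_exporter_enqueue_failed_spans
+```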
-Stream: HTTP connect: dns (p75)
+#### otel-collector: otelcol_processor_dropped_spans
-Portion of time spent on DNS when acquiring an HTTP connection to the upstream.
+Spans dropped per processor per minute
-This panel has no related alerts.
+Shows the rate of spans dropped by the configured processor.
+
+Refer to the [alerts reference](alerts#otel-collector-otelcol_processor_dropped_spans) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100252` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100300` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: dns (p50)
+Cpu usage of the collector
-Portion of time spent on DNS when acquiring an HTTP connection to the upstream.
+Shows CPU usage as reported by the OpenTelemetry collector.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100253` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100400` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: tls (p99)
+Memory allocated to the otel collector
-Portion of time spent on TLS when acquiring an HTTP connection to the upstream.
+Shows the allocated memory Resident Set Size (RSS) as reported by the OpenTelemetry collector.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100260` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100401` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
+Memory used by the collector
+
+Shows how much memory is being used by the otel collector.
-Stream: HTTP connect: tls (p95)
+High memory usage might indicate that:
+* The configured pipeline is keeping a lot of spans in memory for processing
+* Spans are failing to be sent and the exporter is configured to retry
+* A high batch count is configured for the batch processor
-Portion of time spent on TLS when acquiring an HTTP connection to the upstream.
+For more information on configuring processors for the OpenTelemetry collector see https://opentelemetry.io/docs/collector/configuration/#processors.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100261` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100402` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
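+
+If the pipeline itself is the suspect, one hedged mitigation is the collector's `memory_limiter` processor. The values below are illustrative; the fragment must be merged into your existing collector configuration, with `memory_limiter` listed first in the pipeline's processors.
+
+```bash
+# Emit a config fragment that caps collector memory; merge it into the
+# `processors:` section of your collector configuration.
+cat > memory-limiter.fragment.yaml <<'EOF'
+processors:
+  memory_limiter:
+    check_interval: 1s
+    limit_mib: 400
+EOF
+```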
+Container missing
-Stream: HTTP connect: tls (p75)
+This value is the number of times a container has not been seen for more than one minute. If you observe this
+value change independent of deployment events (such as an upgrade), it could indicate pods are being OOM killed or terminated for some other reason.
-Portion of time spent on TLS when acquiring an HTTP connection to the upstream.
+- **Kubernetes:**
+  - Determine if the pod was OOM killed using `kubectl describe pod otel-collector` (look for `OOMKilled: true`) and, if so, consider increasing the memory limit in the relevant `Deployment.yaml`.
+  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `kubectl logs -p otel-collector`.
+- **Docker Compose:**
+  - Determine if the container was OOM killed using `docker inspect -f '\{\{json .State\}\}' otel-collector` (look for `"OOMKilled":true`) and, if so, consider increasing the memory limit of the otel-collector container in `docker-compose.yml`.
+  - Check the logs before the container restarted to see if there are `panic:` messages or similar using `docker logs otel-collector` (note this will include logs from the previous and currently running container).
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100262` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100500` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
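+
+The Kubernetes checks above, consolidated into one hedged snippet (in real deployments pod names carry generated suffixes, so adjust the names accordingly):
+
+```bash
+# Look for an OOMKilled termination on the collector pod:
+kubectl describe pod otel-collector | grep -A3 'Last State'
+# Inspect logs from the previous (crashed) container for panics:
+kubectl logs -p otel-collector | grep -m 5 'panic:'
+```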
-Stream: HTTP connect: tls (p50)
+#### otel-collector: container_cpu_usage
-Portion of time spent on TLS when acquiring an HTTP connection to the upstream.
+Container cpu usage total (1m average) across all cores by instance
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#otel-collector-container_cpu_usage) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100263` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100501` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: dial (p99)
+#### otel-collector: container_memory_usage
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.
+Container memory usage by instance
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#otel-collector-container_memory_usage) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100270` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100502` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: dial (p95)
+Filesystem reads and writes rate by instance over 1h
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.
+This value indicates the number of filesystem read and write operations by containers of this service.
+When extremely high, this can indicate a resource usage problem, or can cause problems with the service itself, especially if high values or spikes correlate with otel-collector issues.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100271` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100503` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
-Stream: HTTP connect: dial (p75)
+#### otel-collector: pods_available_percentage
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.
+Percentage pods available
-This panel has no related alerts.
+Refer to the [alerts reference](alerts#otel-collector-pods_available_percentage) for 1 alert related to this panel.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100272` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/otel-collector/otel-collector?viewPanel=100600` on your Sourcegraph instance.
+*Managed by the Sourcegraph Platform team.*
+Cody chat and code completions.
+
+To see this dashboard, visit `/-/debug/grafana/d/completions/completions` on your Sourcegraph instance.
+
+### Completions: Completions requests
+
+#### completions: api_request_rate
-Stream: HTTP connect: dial (p50)
+Rate of completions API requests
-Portion of time spent on golang Dial() when acquiring an HTTP connection to the upstream.
+Rate (QPS) of requests to cody chat and code completion endpoints.
This panel has no related alerts.
-To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100273` on your Sourcegraph instance.
+To see this panel, visit `/-/debug/grafana/d/completions/completions?viewPanel=100000` on your Sourcegraph instance.