Introduce query timeout configuration (#2157)

## Summary

Implements a configurable query execution timeout so that poorly
optimized or overly broad queries cannot consume excessive server
resources, degrade performance, or crash the Zabbix server.

Fixes: https://github.com/grafana/oss-big-tent-squad/issues/127

## Problem

Previously, the plugin only had an HTTP connection timeout (`timeout`),
which bounded each individual API request. However, a single query
execution can involve multiple API calls, so without a separate limit it
could run indefinitely and potentially cause resource exhaustion.

## Solution

Added a new `queryTimeout` setting that enforces a maximum execution
time for each query executed by the plugin, covering all API calls made
while processing that query. Queries exceeding this limit are
automatically terminated with proper error handling and logging.

## Testing

1. Configure a datasource with `queryTimeout` set to a low value (e.g.,
5 seconds)
2. Execute a query that would normally take longer than the timeout
3. Verify that:
   - Query is terminated after the timeout period
   - Error message indicates timeout occurred
   - Logs contain timeout warning with query details
   - Other queries in the same request continue to execute

## Notes

- `queryTimeout` is separate from `timeout` (HTTP connection timeout)
- `queryTimeout` applies to the entire query execution, which may
involve multiple API calls
- Default value of 60 seconds ensures reasonable protection while
allowing normal queries to complete
- Timeout errors are logged with query refId, queryType, timeout
duration, and datasourceId for troubleshooting
This commit is contained in:
ismail simsek
2026-01-12 15:30:31 +01:00
committed by GitHub
parent 7eb80d3f23
commit a2f8b6433a
7 changed files with 366 additions and 50 deletions

View File

@@ -4,16 +4,20 @@ import (
"context"
"errors"
"fmt"
"net/http"
"time"
"github.com/grafana/grafana-plugin-sdk-go/backend"
"github.com/grafana/grafana-plugin-sdk-go/backend/datasource"
"github.com/grafana/grafana-plugin-sdk-go/backend/instancemgmt"
"github.com/grafana/grafana-plugin-sdk-go/backend/log"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/alexanderzobnin/grafana-zabbix/pkg/httpclient"
"github.com/alexanderzobnin/grafana-zabbix/pkg/metrics"
"github.com/alexanderzobnin/grafana-zabbix/pkg/settings"
"github.com/alexanderzobnin/grafana-zabbix/pkg/zabbix"
"github.com/alexanderzobnin/grafana-zabbix/pkg/zabbixapi"
"github.com/grafana/grafana-plugin-sdk-go/backend"
"github.com/grafana/grafana-plugin-sdk-go/backend/datasource"
"github.com/grafana/grafana-plugin-sdk-go/backend/instancemgmt"
"github.com/grafana/grafana-plugin-sdk-go/backend/log"
)
var (
@@ -113,6 +117,11 @@ func (ds *ZabbixDatasource) QueryData(ctx context.Context, req *backend.QueryDat
return nil, err
}
queryTimeout := zabbixDS.Settings.QueryTimeout
if queryTimeout <= 0 {
queryTimeout = 60 * time.Second // Default to 60 seconds if not configured
}
for _, q := range req.Queries {
res := backend.DataResponse{}
query, err := ReadQuery(q)
@@ -122,22 +131,52 @@ func (ds *ZabbixDatasource) QueryData(ctx context.Context, req *backend.QueryDat
} else if err := ValidateTimeRange(query.TimeRange); err != nil {
// Validate time range before processing any query
res = backend.ErrorResponseWithErrorSource(err)
} else if query.QueryType == MODE_METRICS {
frames, err := zabbixDS.queryNumericItems(ctx, &query)
if err != nil {
res = backend.ErrorResponseWithErrorSource(err)
} else {
res.Frames = append(res.Frames, frames...)
}
} else if query.QueryType == MODE_ITEMID {
frames, err := zabbixDS.queryItemIdData(ctx, &query)
if err != nil {
res = backend.ErrorResponseWithErrorSource(err)
} else {
res.Frames = append(res.Frames, frames...)
}
} else {
res = backend.ErrorResponseWithErrorSource(backend.DownstreamError(ErrNonMetricQueryNotSupported))
// Create a context with timeout for this specific query
queryCtx, cancel := context.WithTimeout(ctx, queryTimeout)
// Execute query with timeout context in an anonymous function to ensure cancel is called after each iteration
func() {
defer cancel()
var frames []*data.Frame
var queryErr error
switch query.QueryType {
case MODE_METRICS:
frames, queryErr = zabbixDS.queryNumericItems(queryCtx, &query)
case MODE_ITEMID:
frames, queryErr = zabbixDS.queryItemIdData(queryCtx, &query)
default:
queryErr = backend.DownstreamError(ErrNonMetricQueryNotSupported)
}
// Check if query timed out
if queryErr != nil {
if errors.Is(queryCtx.Err(), context.DeadlineExceeded) {
// Query exceeded the configured timeout
timeoutMsg := fmt.Sprintf(
"Query execution exceeded maximum allowed time (%v). Query was automatically terminated to prevent excessive resource consumption.",
queryTimeout,
)
ds.logger.Warn(
"Query timeout exceeded",
"refId", q.RefID,
"queryType", query.QueryType,
"timeout", queryTimeout,
"datasourceId", req.PluginContext.DataSourceInstanceSettings.ID,
)
res = backend.ErrorResponseWithErrorSource(
backend.DownstreamError(fmt.Errorf("query timeout: %s", timeoutMsg)),
)
res.Status = http.StatusRequestTimeout
} else {
res = backend.ErrorResponseWithErrorSource(queryErr)
}
} else {
res.Frames = append(res.Frames, frames...)
}
}()
}
qdr.Responses[q.RefID] = res
}

View File

@@ -2,8 +2,12 @@ package datasource
import (
"context"
"encoding/json"
"strings"
"testing"
"time"
"github.com/alexanderzobnin/grafana-zabbix/pkg/settings"
"github.com/grafana/grafana-plugin-sdk-go/backend"
"gotest.tools/assert"
)
@@ -66,3 +70,101 @@ func TestZabbixBackend_getCachedDatasource(t *testing.T) {
})
}
}
func TestQueryData_QueryTimeoutConfiguration(t *testing.T) {
tests := []struct {
name string
queryTimeout interface{}
expectedTimeout time.Duration
description string
}{
{
name: "Default timeout when not configured",
queryTimeout: nil,
expectedTimeout: 60 * time.Second,
description: "Should use default 60 seconds when queryTimeout is not set",
},
{
name: "Default timeout when zero",
queryTimeout: 0,
expectedTimeout: 60 * time.Second,
description: "Should use default 60 seconds when queryTimeout is 0",
},
{
name: "Custom timeout configured",
queryTimeout: 30,
expectedTimeout: 30 * time.Second,
description: "Should use configured queryTimeout value",
},
{
name: "Custom timeout as string",
queryTimeout: "45",
expectedTimeout: 45 * time.Second,
description: "Should parse string queryTimeout value",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Create datasource settings with queryTimeout
jsonData := map[string]interface{}{
"queryTimeout": tt.queryTimeout,
}
jsonBytes, _ := json.Marshal(jsonData)
dsSettings := backend.DataSourceInstanceSettings{
ID: 1,
Name: "TestDatasource",
URL: "http://zabbix.org/zabbix",
JSONData: jsonBytes,
}
// Parse settings to verify timeout is set correctly
zabbixSettings, err := settings.ReadZabbixSettings(&dsSettings)
assert.NilError(t, err)
assert.Equal(t, tt.expectedTimeout, zabbixSettings.QueryTimeout, tt.description)
})
}
}
func TestQueryData_QueryTimeoutContextCreation(t *testing.T) {
// Test that query timeout context is properly created with the configured timeout
jsonData := map[string]interface{}{
"queryTimeout": 5, // 5 seconds
}
jsonBytes, _ := json.Marshal(jsonData)
dsSettings := backend.DataSourceInstanceSettings{
ID: 1,
Name: "TestDatasource",
URL: "http://zabbix.org/zabbix",
JSONData: jsonBytes,
}
// Verify queryTimeout is set correctly
zabbixSettings, err := settings.ReadZabbixSettings(&dsSettings)
assert.NilError(t, err)
assert.Equal(t, 5*time.Second, zabbixSettings.QueryTimeout)
// Test that context with timeout is created correctly
ctx := context.Background()
queryCtx, cancel := context.WithTimeout(ctx, zabbixSettings.QueryTimeout)
defer cancel()
// Verify context has deadline set
deadline, ok := queryCtx.Deadline()
assert.Assert(t, ok, "Context should have a deadline")
assert.Assert(t, deadline.After(time.Now()), "Deadline should be in the future")
assert.Assert(t, deadline.Before(time.Now().Add(6*time.Second)), "Deadline should be approximately 5 seconds from now")
}
// TestQueryData_QueryTimeoutErrorMessage checks that the expected timeout
// message text contains each of the key fragments.
//
// NOTE(review): the message is a literal defined in this test; the test does
// not invoke QueryData, so it will not notice if the production wording drifts.
func TestQueryData_QueryTimeoutErrorMessage(t *testing.T) {
	const timeoutMsg = "Query execution exceeded maximum allowed time (5s). Query was automatically terminated to prevent excessive resource consumption."

	// Fragments are checked in order; the first missing one fails the test.
	wantFragments := []string{
		"Query execution exceeded maximum allowed time",
		"5s",
		"automatically terminated",
		"prevent excessive resource consumption",
	}
	for _, fragment := range wantFragments {
		assert.Assert(t, strings.Contains(timeoutMsg, fragment))
	}
}