Introduce query timeout configuration (#2157)
## Summary

Implements configurable query execution timeout controls to prevent poorly optimized or overly broad queries from consuming excessive server resources, causing performance degradation, or crashing the Zabbix server.

Fixes: https://github.com/grafana/oss-big-tent-squad/issues/127

## Problem

Previously, the plugin only had an HTTP connection timeout (`timeout`) that controlled individual API request timeouts. However, a complete query execution could involve multiple API calls and run indefinitely if not properly controlled, potentially causing resource exhaustion.

## Solution

Added a new `queryTimeout` setting that enforces a maximum execution time for each entire query initiated by the plugin. Queries exceeding this limit are automatically terminated with proper error handling and logging.

## Testing

1. Configure a datasource with `queryTimeout` set to a low value (e.g., 5 seconds)
2. Execute a query that would normally take longer than the timeout
3. Verify that:
   - Query is terminated after the timeout period
   - Error message indicates timeout occurred
   - Logs contain timeout warning with query details
   - Other queries in the same request continue to execute

## Notes

- `queryTimeout` is separate from `timeout` (HTTP connection timeout)
- `queryTimeout` applies to the entire query execution, which may involve multiple API calls
- Default value of 60 seconds ensures reasonable protection while allowing normal queries to complete
- Timeout errors are logged with query refId, queryType, timeout duration, and datasourceId for troubleshooting
This commit is contained in:
@@ -4,16 +4,20 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend/datasource"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend/instancemgmt"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend/log"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
|
||||
"github.com/alexanderzobnin/grafana-zabbix/pkg/httpclient"
|
||||
"github.com/alexanderzobnin/grafana-zabbix/pkg/metrics"
|
||||
"github.com/alexanderzobnin/grafana-zabbix/pkg/settings"
|
||||
"github.com/alexanderzobnin/grafana-zabbix/pkg/zabbix"
|
||||
"github.com/alexanderzobnin/grafana-zabbix/pkg/zabbixapi"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend/datasource"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend/instancemgmt"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend/log"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -113,6 +117,11 @@ func (ds *ZabbixDatasource) QueryData(ctx context.Context, req *backend.QueryDat
|
||||
return nil, err
|
||||
}
|
||||
|
||||
queryTimeout := zabbixDS.Settings.QueryTimeout
|
||||
if queryTimeout <= 0 {
|
||||
queryTimeout = 60 * time.Second // Default to 60 seconds if not configured
|
||||
}
|
||||
|
||||
for _, q := range req.Queries {
|
||||
res := backend.DataResponse{}
|
||||
query, err := ReadQuery(q)
|
||||
@@ -122,22 +131,52 @@ func (ds *ZabbixDatasource) QueryData(ctx context.Context, req *backend.QueryDat
|
||||
} else if err := ValidateTimeRange(query.TimeRange); err != nil {
|
||||
// Validate time range before processing any query
|
||||
res = backend.ErrorResponseWithErrorSource(err)
|
||||
} else if query.QueryType == MODE_METRICS {
|
||||
frames, err := zabbixDS.queryNumericItems(ctx, &query)
|
||||
if err != nil {
|
||||
res = backend.ErrorResponseWithErrorSource(err)
|
||||
} else {
|
||||
res.Frames = append(res.Frames, frames...)
|
||||
}
|
||||
} else if query.QueryType == MODE_ITEMID {
|
||||
frames, err := zabbixDS.queryItemIdData(ctx, &query)
|
||||
if err != nil {
|
||||
res = backend.ErrorResponseWithErrorSource(err)
|
||||
} else {
|
||||
res.Frames = append(res.Frames, frames...)
|
||||
}
|
||||
} else {
|
||||
res = backend.ErrorResponseWithErrorSource(backend.DownstreamError(ErrNonMetricQueryNotSupported))
|
||||
// Create a context with timeout for this specific query
|
||||
queryCtx, cancel := context.WithTimeout(ctx, queryTimeout)
|
||||
|
||||
// Execute query with timeout context in an anonymous function to ensure cancel is called after each iteration
|
||||
func() {
|
||||
defer cancel()
|
||||
|
||||
var frames []*data.Frame
|
||||
var queryErr error
|
||||
|
||||
switch query.QueryType {
|
||||
case MODE_METRICS:
|
||||
frames, queryErr = zabbixDS.queryNumericItems(queryCtx, &query)
|
||||
case MODE_ITEMID:
|
||||
frames, queryErr = zabbixDS.queryItemIdData(queryCtx, &query)
|
||||
default:
|
||||
queryErr = backend.DownstreamError(ErrNonMetricQueryNotSupported)
|
||||
}
|
||||
|
||||
// Check if query timed out
|
||||
if queryErr != nil {
|
||||
if errors.Is(queryCtx.Err(), context.DeadlineExceeded) {
|
||||
// Query exceeded the configured timeout
|
||||
timeoutMsg := fmt.Sprintf(
|
||||
"Query execution exceeded maximum allowed time (%v). Query was automatically terminated to prevent excessive resource consumption.",
|
||||
queryTimeout,
|
||||
)
|
||||
ds.logger.Warn(
|
||||
"Query timeout exceeded",
|
||||
"refId", q.RefID,
|
||||
"queryType", query.QueryType,
|
||||
"timeout", queryTimeout,
|
||||
"datasourceId", req.PluginContext.DataSourceInstanceSettings.ID,
|
||||
)
|
||||
res = backend.ErrorResponseWithErrorSource(
|
||||
backend.DownstreamError(fmt.Errorf("query timeout: %s", timeoutMsg)),
|
||||
)
|
||||
res.Status = http.StatusRequestTimeout
|
||||
} else {
|
||||
res = backend.ErrorResponseWithErrorSource(queryErr)
|
||||
}
|
||||
} else {
|
||||
res.Frames = append(res.Frames, frames...)
|
||||
}
|
||||
}()
|
||||
}
|
||||
qdr.Responses[q.RefID] = res
|
||||
}
|
||||
|
||||
@@ -2,8 +2,12 @@ package datasource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alexanderzobnin/grafana-zabbix/pkg/settings"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend"
|
||||
"gotest.tools/assert"
|
||||
)
|
||||
@@ -66,3 +70,101 @@ func TestZabbixBackend_getCachedDatasource(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryData_QueryTimeoutConfiguration(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
queryTimeout interface{}
|
||||
expectedTimeout time.Duration
|
||||
description string
|
||||
}{
|
||||
{
|
||||
name: "Default timeout when not configured",
|
||||
queryTimeout: nil,
|
||||
expectedTimeout: 60 * time.Second,
|
||||
description: "Should use default 60 seconds when queryTimeout is not set",
|
||||
},
|
||||
{
|
||||
name: "Default timeout when zero",
|
||||
queryTimeout: 0,
|
||||
expectedTimeout: 60 * time.Second,
|
||||
description: "Should use default 60 seconds when queryTimeout is 0",
|
||||
},
|
||||
{
|
||||
name: "Custom timeout configured",
|
||||
queryTimeout: 30,
|
||||
expectedTimeout: 30 * time.Second,
|
||||
description: "Should use configured queryTimeout value",
|
||||
},
|
||||
{
|
||||
name: "Custom timeout as string",
|
||||
queryTimeout: "45",
|
||||
expectedTimeout: 45 * time.Second,
|
||||
description: "Should parse string queryTimeout value",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Create datasource settings with queryTimeout
|
||||
jsonData := map[string]interface{}{
|
||||
"queryTimeout": tt.queryTimeout,
|
||||
}
|
||||
jsonBytes, _ := json.Marshal(jsonData)
|
||||
|
||||
dsSettings := backend.DataSourceInstanceSettings{
|
||||
ID: 1,
|
||||
Name: "TestDatasource",
|
||||
URL: "http://zabbix.org/zabbix",
|
||||
JSONData: jsonBytes,
|
||||
}
|
||||
|
||||
// Parse settings to verify timeout is set correctly
|
||||
zabbixSettings, err := settings.ReadZabbixSettings(&dsSettings)
|
||||
assert.NilError(t, err)
|
||||
assert.Equal(t, tt.expectedTimeout, zabbixSettings.QueryTimeout, tt.description)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryData_QueryTimeoutContextCreation(t *testing.T) {
|
||||
// Test that query timeout context is properly created with the configured timeout
|
||||
jsonData := map[string]interface{}{
|
||||
"queryTimeout": 5, // 5 seconds
|
||||
}
|
||||
jsonBytes, _ := json.Marshal(jsonData)
|
||||
|
||||
dsSettings := backend.DataSourceInstanceSettings{
|
||||
ID: 1,
|
||||
Name: "TestDatasource",
|
||||
URL: "http://zabbix.org/zabbix",
|
||||
JSONData: jsonBytes,
|
||||
}
|
||||
|
||||
// Verify queryTimeout is set correctly
|
||||
zabbixSettings, err := settings.ReadZabbixSettings(&dsSettings)
|
||||
assert.NilError(t, err)
|
||||
assert.Equal(t, 5*time.Second, zabbixSettings.QueryTimeout)
|
||||
|
||||
// Test that context with timeout is created correctly
|
||||
ctx := context.Background()
|
||||
queryCtx, cancel := context.WithTimeout(ctx, zabbixSettings.QueryTimeout)
|
||||
defer cancel()
|
||||
|
||||
// Verify context has deadline set
|
||||
deadline, ok := queryCtx.Deadline()
|
||||
assert.Assert(t, ok, "Context should have a deadline")
|
||||
assert.Assert(t, deadline.After(time.Now()), "Deadline should be in the future")
|
||||
assert.Assert(t, deadline.Before(time.Now().Add(6*time.Second)), "Deadline should be approximately 5 seconds from now")
|
||||
}
|
||||
|
||||
func TestQueryData_QueryTimeoutErrorMessage(t *testing.T) {
|
||||
// Test that timeout error message contains the expected information
|
||||
timeoutMsg := "Query execution exceeded maximum allowed time (5s). Query was automatically terminated to prevent excessive resource consumption."
|
||||
|
||||
// Verify error message format
|
||||
assert.Assert(t, strings.Contains(timeoutMsg, "Query execution exceeded maximum allowed time"))
|
||||
assert.Assert(t, strings.Contains(timeoutMsg, "5s"))
|
||||
assert.Assert(t, strings.Contains(timeoutMsg, "automatically terminated"))
|
||||
assert.Assert(t, strings.Contains(timeoutMsg, "prevent excessive resource consumption"))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user