diff --git a/client/pkg/caller/caller.go b/client/pkg/caller/caller.go index 04bd221216b..3d6497682c1 100644 --- a/client/pkg/caller/caller.go +++ b/client/pkg/caller/caller.go @@ -31,8 +31,27 @@ type ( const ( // TestID is used for test. TestID ID = "test" + // TestComponent is used for test. TestComponent Component = "test" + + // Caller components of client-go. + CodecPDClient Component = "codec-pd-client" + RegionCache Component = "region-cache" + StoreCache Component = "store-cache" + Oracles Component = "oracle" + Rawkv Component = "rawkv" + KvStore Component = "kv-store" + InterceptedPDClient Component = "intercepted-pd-client" + + // Caller components of TiDB. + Pitr Component = "pitr" + Ddl Component = "ddl" + ImportInto Component = "import-into" + TikvHandler Component = "tikv-handler" + GcWorker Component = "gc-worker" + GcJob Component = "gc-job" + DistributedGcJob Component = "distributed-gc-job" ) var processName ID diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 4fc58afe772..4f3c5eab10f 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -9157,6 +9157,228 @@ "steppedLine": false, "timeFrom": null, "timeShift": null + }, + { + "aliasColors": {}, + "dashLength": 10, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The region request from different source.", + "editable": true, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 135 + }, + "id": 906, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "pluginVersion": "7.5.17", + "pointradius": 5, + "renderer": "flot", + "seriesOverrides": [], +
"spaceLength": 10, + "targets": [ + { + "expr": "sum(rate(pd_server_region_request_cnt{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (request, caller_id, caller_component)", + "legendFormat": "{{request}}-{{caller_id}}->{{caller_component}}", + "interval": "", + "exemplar": true, + "intervalFactor": 2, + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Total Region Request", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 10, + "max": null, + "min": null, + "show": true, + "$$hashKey": "object:132" + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true, + "$$hashKey": "object:133" + } + ], + "yaxis": { + "align": false, + "alignLevel": null + }, + "bars": false, + "dashes": false, + "decimals": null, + "error": false, + "fillGradient": 0, + "hiddenSeries": false, + "percentage": false, + "points": false, + "stack": false, + "steppedLine": false, + "timeFrom": null, + "timeShift": null + }, + { + "aliasColors": {}, + "dashLength": 10, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The failed region requests (non-empty err_msg) from different sources.", + "editable": true, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 143 + }, + "id": 907, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, +
"paceLength": 10, + "pluginVersion": "7.5.17", + "pointradius": 5, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "targets": [ + { + "expr": "sum(rate(pd_server_region_request_cnt{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", err_msg!=\"\"}[1m])) by (request, caller_id, caller_component)", + "legendFormat": "{{request}}-{{caller_id}}->{{caller_component}}", + "interval": "", + "exemplar": true, + "intervalFactor": 2, + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Failed Region Request", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 10, + "max": null, + "min": null, + "show": true, + "$$hashKey": "object:132" + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true, + "$$hashKey": "object:133" + } + ], + "yaxis": { + "align": false, + "alignLevel": null + }, + "bars": false, + "dashes": false, + "decimals": null, + "error": false, + "fillGradient": 0, + "hiddenSeries": false, + "percentage": false, + "points": false, + "stack": false, + "steppedLine": false, + "timeFrom": null, + "timeShift": null } ], "repeat": null, diff --git a/server/grpc_service.go b/server/grpc_service.go index a311b8aedc3..814fa899531 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1359,7 +1359,7 @@ func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error } // GetRegion implements gRPC PDServer. 
-func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionRequest) (*pdpb.GetRegionResponse, error) {
+func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionRequest) (resp *pdpb.GetRegionResponse, err error) {
 	failpoint.Inject("rateLimit", func() {
 		failpoint.Return(nil, errs.ErrGRPCRateLimitExceeded(errs.ErrRateLimitExceeded))
 	})
@@ -1384,6 +1384,9 @@ func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionReque
 		rc     *cluster.RaftCluster
 		region *core.RegionInfo
 	)
+	defer func() { // resp may still be nil here (panic or nil return), so use the nil-safe getters
+		incRegionRequestCounter("GetRegion", request.GetHeader(), resp.GetHeader().GetError())
+	}()
 	if *followerHandle {
 		rc = s.cluster
 		if !rc.GetRegionSyncer().IsRunning() {
@@ -1422,7 +1425,7 @@ func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionReque
 }
 
 // GetPrevRegion implements gRPC PDServer
-func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionRequest) (*pdpb.GetRegionResponse, error) {
+func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionRequest) (resp *pdpb.GetRegionResponse, err error) {
 	done, err := s.rateLimitCheck()
 	if err != nil {
 		return nil, err
@@ -1440,6 +1443,9 @@ func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionR
 		return rsp.(*pdpb.GetRegionResponse), err
 	}
 
+	defer func() { // resp may still be nil here (panic or nil return), so use the nil-safe getters
+		incRegionRequestCounter("GetPrevRegion", request.GetHeader(), resp.GetHeader().GetError())
+	}()
 	var rc *cluster.RaftCluster
 	if *followerHandle {
 		// no need to check running status
@@ -1477,7 +1483,7 @@ func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionR
 }
 
 // GetRegionByID implements gRPC PDServer.
-func (s *GrpcServer) GetRegionByID(ctx context.Context, request *pdpb.GetRegionByIDRequest) (*pdpb.GetRegionResponse, error) {
+func (s *GrpcServer) GetRegionByID(ctx context.Context, request *pdpb.GetRegionByIDRequest) (resp *pdpb.GetRegionResponse, err error) {
 	done, err := s.rateLimitCheck()
 	if err != nil {
 		return nil, err
@@ -1495,6 +1501,9 @@ func (s *GrpcServer) GetRegionByID(ctx context.Context, request *pdpb.GetRegionB
 		return rsp.(*pdpb.GetRegionResponse), err
 	}
 
+	defer func() { // resp may still be nil here (panic or nil return), so use the nil-safe getters
+		incRegionRequestCounter("GetRegionByID", request.GetHeader(), resp.GetHeader().GetError())
+	}()
 	var rc *cluster.RaftCluster
 	if *followerHandle {
 		rc = s.cluster
@@ -1584,6 +1593,10 @@ func (s *GrpcServer) QueryRegion(stream pdpb.PD_QueryRegionServer) error {
 			PrevKeyIdMap: prevKeyIDMap,
 			RegionsById:  regionsByID,
 		}
+		// Record the request exactly once, and only through the helper: it
+		// normalizes empty caller labels to "unknown", so a direct Inc() on
+		// regionRequestCounter would double-count and split the series.
+		incRegionRequestCounter("QueryRegion", request.GetHeader(), response.GetHeader().GetError())
 		if err := stream.Send(response); err != nil {
 			return errors.WithStack(err)
 		}
@@ -1592,7 +1605,7 @@ func (s *GrpcServer) QueryRegion(stream pdpb.PD_QueryRegionServer) error {
 
 // Deprecated: use BatchScanRegions instead.
 // ScanRegions implements gRPC PDServer.
-func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsRequest) (*pdpb.ScanRegionsResponse, error) {
+func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsRequest) (resp *pdpb.ScanRegionsResponse, err error) {
 	done, err := s.rateLimitCheck()
 	if err != nil {
 		return nil, err
@@ -1610,6 +1623,9 @@ func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsR
 		return rsp.(*pdpb.ScanRegionsResponse), nil
 	}
 
+	defer func() { // resp may still be nil here (panic or nil return), so use the nil-safe getters
+		incRegionRequestCounter("ScanRegions", request.GetHeader(), resp.GetHeader().GetError())
+	}()
 	var rc *cluster.RaftCluster
 	if *followerHandle {
 		rc = s.cluster
@@ -1626,7 +1642,7 @@ func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsR
 	if *followerHandle && len(regions) == 0 {
 		return &pdpb.ScanRegionsResponse{Header: regionNotFound()}, nil
 	}
-	resp := &pdpb.ScanRegionsResponse{Header: wrapHeader()}
+	resp = &pdpb.ScanRegionsResponse{Header: wrapHeader()}
 	for _, r := range regions {
 		leader := r.GetLeader()
 		if leader == nil {
@@ -1646,7 +1662,7 @@ func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsR
 }
 
 // BatchScanRegions implements gRPC PDServer.
-func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchScanRegionsRequest) (*pdpb.BatchScanRegionsResponse, error) {
+func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchScanRegionsRequest) (resp *pdpb.BatchScanRegionsResponse, err error) {
 	done, err := s.rateLimitCheck()
 	if err != nil {
 		return nil, err
@@ -1664,6 +1680,10 @@ func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchSc
 		return rsp.(*pdpb.BatchScanRegionsResponse), nil
 	}
 
+	defer func() { // resp may still be nil here (panic or nil return), so use the nil-safe getters
+		incRegionRequestCounter("BatchScanRegions", request.GetHeader(), resp.GetHeader().GetError())
+	}()
+
 	var rc *cluster.RaftCluster
 	if *followerHandle {
 		rc = s.cluster
@@ -1729,7 +1749,7 @@ func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchSc
 	if *followerHandle && len(regions) == 0 {
 		return &pdpb.BatchScanRegionsResponse{Header: regionNotFound()}, nil
 	}
-	resp := &pdpb.BatchScanRegionsResponse{Header: wrapHeader(), Regions: regions}
+	resp = &pdpb.BatchScanRegionsResponse{Header: wrapHeader(), Regions: regions}
 	return resp, nil
 }
 
diff --git a/server/metrics.go b/server/metrics.go
index 709ce95ad70..f0c601e441d 100644
--- a/server/metrics.go
+++ b/server/metrics.go
@@ -14,7 +14,10 @@
 
 package server
 
-import "github.com/prometheus/client_golang/prometheus"
+import (
+	"github.com/pingcap/kvproto/pkg/pdpb"
+	"github.com/prometheus/client_golang/prometheus"
+)
 
 var (
 	timeJumpBackCounter = prometheus.NewCounter(
@@ -177,6 +180,14 @@ var (
 			Help:    "Bucketed histogram of processing time (s) of handled forward tso requests.",
 			Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13),
 		})
+
+	regionRequestCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "pd",
+			Subsystem: "server",
+			Name:      "region_request_cnt",
+			Help:      "Counter of region request.",
+		}, []string{"request", "caller_id", "caller_component", "err_msg"})
 )
 
 func init() {
@@ -199,4 +210,26 @@ func init() {
 	prometheus.MustRegister(apiConcurrencyGauge)
 	prometheus.MustRegister(forwardFailCounter)
 	prometheus.MustRegister(forwardTsoDuration)
+	prometheus.MustRegister(regionRequestCounter)
+}
+
+// incRegionRequestCounter increments regionRequestCounter for one region
+// request, labelled with the gRPC method, caller ID, caller component and the
+// error type (empty when err is nil). Empty caller labels become "unknown".
+// header and err may be nil: only the generated nil-safe getters are used,
+// which matters because callers invoke this from deferred closures.
+func incRegionRequestCounter(method string, header *pdpb.RequestHeader, err *pdpb.Error) {
+	errMsg := ""
+	if err != nil {
+		errMsg = err.GetType().String()
+	}
+	callerID := header.GetCallerId()
+	if callerID == "" {
+		callerID = "unknown"
+	}
+	callerComponent := header.GetCallerComponent()
+	if callerComponent == "" {
+		callerComponent = "unknown"
+	}
+	regionRequestCounter.WithLabelValues(method, callerID, callerComponent, errMsg).Inc()
 }