Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: add region request metrics which record caller info #9117

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions client/pkg/caller/caller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,27 @@ type (
const (
// TestID is used for test.
TestID ID = "test"

// TestComponent is used for test.
TestComponent Component = "test"

// client-go component
CodecPDClient Component = "codec-pd-client"
RegionCache Component = "region-cache"
StoreCache Component = "store-cache"
Oracles Component = "oracle"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Oracles Component = "oracle"
Oracle Component = "oracle"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is it used?

Copy link
Member Author

@okJiang okJiang Mar 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is used in another repo: https://github.com/tikv/pd/pull/9117/files/ebd66e5b0042353ad6402f6239ab049a438c21a0#r1982716822

Did I misunderstand your comment? I thought what you asked for was what the current PR does, as in tikv/client-go#1516 (comment).

Rawkv Component = "rawkv"
KvStore Component = "kv-store"
InterceptedPDClient Component = "intercepted-pd-client"

// TiDB component
Pitr Component = "pitr"
Ddl Component = "ddl"
ImportInto Component = "import-into"
TikvHandler Component = "tikv-handler"
GcWorker Component = "gc-worker"
GcJob Component = "gc-job"
DistributedGcJob Component = "distributed-gc-job"
Comment on lines +38 to +54
Copy link
Member Author

@okJiang okJiang Mar 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These names can be reviewed together with these two PRs: https://github.com/pingcap/tidb/pull/59911/files and tikv/client-go#1516

)

var processName ID
Expand Down
222 changes: 222 additions & 0 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -9157,6 +9157,228 @@
"steppedLine": false,
"timeFrom": null,
"timeShift": null
},
{
"aliasColors": {},
"dashLength": 10,
"datasource": "${DS_TEST-CLUSTER}",
"description": "The region requests from different sources.",
"editable": true,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"grid": {},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 135
},
"id": 906,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 300,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"paceLength": 10,
"pluginVersion": "7.5.17",
"pointradius": 5,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"targets": [
{
"expr": "sum(rate(pd_server_region_request_cnt{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (request, caller_id, caller_component)",
"legendFormat": "{{request}}-{{caller_id}}->{{caller_component}}",
"interval": "",
"exemplar": true,
"intervalFactor": 2,
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeRegions": [],
"title": "Total Region Request",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ops",
"label": null,
"logBase": 10,
"max": null,
"min": null,
"show": true,
"$$hashKey": "object:132"
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true,
"$$hashKey": "object:133"
}
],
"yaxis": {
"align": false,
"alignLevel": null
},
"bars": false,
"dashes": false,
"decimals": null,
"error": false,
"fillGradient": 0,
"hiddenSeries": false,
"percentage": false,
"points": false,
"stack": false,
"steppedLine": false,
"timeFrom": null,
"timeShift": null
},
{
"aliasColors": {},
"dashLength": 10,
"datasource": "${DS_TEST-CLUSTER}",
"description": "The failed region requests from different sources.",
"editable": true,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"grid": {},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 143
},
"id": 907,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 300,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"paceLength": 10,
"pluginVersion": "7.5.17",
"pointradius": 5,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"targets": [
{
"expr": "sum(rate(pd_server_region_request_cnt{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", err_msg!=\"\"}[1m])) by (request, caller_id, caller_component)",
"legendFormat": "{{request}}-{{caller_id}}->{{caller_component}}",
"interval": "",
"exemplar": true,
"intervalFactor": 2,
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeRegions": [],
"title": "Failed Region Request",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ops",
"label": null,
"logBase": 10,
"max": null,
"min": null,
"show": true,
"$$hashKey": "object:132"
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true,
"$$hashKey": "object:133"
}
],
"yaxis": {
"align": false,
"alignLevel": null
},
"bars": false,
"dashes": false,
"decimals": null,
"error": false,
"fillGradient": 0,
"hiddenSeries": false,
"percentage": false,
"points": false,
"stack": false,
"steppedLine": false,
"timeFrom": null,
"timeShift": null
}
],
"repeat": null,
Expand Down
34 changes: 27 additions & 7 deletions server/grpc_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -1359,7 +1359,7 @@ func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error
}

// GetRegion implements gRPC PDServer.
func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionRequest) (*pdpb.GetRegionResponse, error) {
func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionRequest) (resp *pdpb.GetRegionResponse, err error) {
failpoint.Inject("rateLimit", func() {
failpoint.Return(nil, errs.ErrGRPCRateLimitExceeded(errs.ErrRateLimitExceeded))
})
Expand All @@ -1384,6 +1384,9 @@ func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionReque
rc *cluster.RaftCluster
region *core.RegionInfo
)
defer func() {
incRegionRequestCounter("GetRegion", request.Header, resp.Header.Error)
}()
if *followerHandle {
rc = s.cluster
if !rc.GetRegionSyncer().IsRunning() {
Expand Down Expand Up @@ -1422,7 +1425,7 @@ func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionReque
}

// GetPrevRegion implements gRPC PDServer
func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionRequest) (*pdpb.GetRegionResponse, error) {
func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionRequest) (resp *pdpb.GetRegionResponse, err error) {
done, err := s.rateLimitCheck()
if err != nil {
return nil, err
Expand All @@ -1440,6 +1443,9 @@ func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionR
return rsp.(*pdpb.GetRegionResponse), err
}

defer func() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we extract one function for this, and get the call name another way?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't there only one function right now?

incRegionRequestCounter("GetPrevRegion", request.Header, resp.Header.Error)
}()
var rc *cluster.RaftCluster
if *followerHandle {
// no need to check running status
Expand Down Expand Up @@ -1477,7 +1483,7 @@ func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionR
}

// GetRegionByID implements gRPC PDServer.
func (s *GrpcServer) GetRegionByID(ctx context.Context, request *pdpb.GetRegionByIDRequest) (*pdpb.GetRegionResponse, error) {
func (s *GrpcServer) GetRegionByID(ctx context.Context, request *pdpb.GetRegionByIDRequest) (resp *pdpb.GetRegionResponse, err error) {
done, err := s.rateLimitCheck()
if err != nil {
return nil, err
Expand All @@ -1495,6 +1501,9 @@ func (s *GrpcServer) GetRegionByID(ctx context.Context, request *pdpb.GetRegionB
return rsp.(*pdpb.GetRegionResponse), err
}

defer func() {
incRegionRequestCounter("GetRegionByID", request.Header, resp.Header.Error)
}()
var rc *cluster.RaftCluster
if *followerHandle {
rc = s.cluster
Expand Down Expand Up @@ -1584,6 +1593,10 @@ func (s *GrpcServer) QueryRegion(stream pdpb.PD_QueryRegionServer) error {
PrevKeyIdMap: prevKeyIDMap,
RegionsById: regionsByID,
}
incRegionRequestCounter("QueryRegion", request.Header, response.Header.Error)

regionRequestCounter.WithLabelValues("QueryRegion", request.Header.CallerId,
request.Header.CallerComponent, "").Inc()
if err := stream.Send(response); err != nil {
return errors.WithStack(err)
}
Expand All @@ -1592,7 +1605,7 @@ func (s *GrpcServer) QueryRegion(stream pdpb.PD_QueryRegionServer) error {

// Deprecated: use BatchScanRegions instead.
// ScanRegions implements gRPC PDServer.
func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsRequest) (*pdpb.ScanRegionsResponse, error) {
func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsRequest) (resp *pdpb.ScanRegionsResponse, err error) {
done, err := s.rateLimitCheck()
if err != nil {
return nil, err
Expand All @@ -1610,6 +1623,9 @@ func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsR
return rsp.(*pdpb.ScanRegionsResponse), nil
}

defer func() {
incRegionRequestCounter("ScanRegions", request.Header, resp.Header.Error)
}()
var rc *cluster.RaftCluster
if *followerHandle {
rc = s.cluster
Expand All @@ -1626,7 +1642,7 @@ func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsR
if *followerHandle && len(regions) == 0 {
return &pdpb.ScanRegionsResponse{Header: regionNotFound()}, nil
}
resp := &pdpb.ScanRegionsResponse{Header: wrapHeader()}
resp = &pdpb.ScanRegionsResponse{Header: wrapHeader()}
for _, r := range regions {
leader := r.GetLeader()
if leader == nil {
Expand All @@ -1646,7 +1662,7 @@ func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsR
}

// BatchScanRegions implements gRPC PDServer.
func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchScanRegionsRequest) (*pdpb.BatchScanRegionsResponse, error) {
func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchScanRegionsRequest) (resp *pdpb.BatchScanRegionsResponse, err error) {
done, err := s.rateLimitCheck()
if err != nil {
return nil, err
Expand All @@ -1664,6 +1680,10 @@ func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchSc
return rsp.(*pdpb.BatchScanRegionsResponse), nil
}

defer func() {
incRegionRequestCounter("BatchScanRegions", request.Header, resp.Header.Error)
}()

var rc *cluster.RaftCluster
if *followerHandle {
rc = s.cluster
Expand Down Expand Up @@ -1729,7 +1749,7 @@ func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchSc
if *followerHandle && len(regions) == 0 {
return &pdpb.BatchScanRegionsResponse{Header: regionNotFound()}, nil
}
resp := &pdpb.BatchScanRegionsResponse{Header: wrapHeader(), Regions: regions}
resp = &pdpb.BatchScanRegionsResponse{Header: wrapHeader(), Regions: regions}
return resp, nil
}

Expand Down
Loading
Loading